Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -27998,6 +27998,250 @@
reorderGRInstForDTVB: false
tailLoopOptA: false
tailLoopOptB: false
- 1LDSBuffer: 0
ActivationAlt: false
ActivationFuncCall: false
ActivationFused: true
AssertAIGreaterThanEqual: -1
AssertAILessThanEqual: -1
AssertFree0ElementMultiple: 1
AssertFree1ElementMultiple: 1
AssertSummationElementMultiple: 1
AssignedDerivedParameters: true
AssignedProblemIndependentDerivedParameters: true
BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x192x32_MI16t2mYsG6m-GyRPJ5-k2EHtMUrt1O4D6pzYx22nhyNKxs=
BufferLoad: true
BufferStore: true
CUCount: null
CUOccupancy: -1
ClusterLocalRead: 1
CodeObjectVersion: '4'
ConvertAfterDS: false
CustomKernelName: ''
DebugStreamK: 0
DepthU: 32
DirectToLds: true
DirectToLdsA: true
DirectToLdsB: true
DirectToVgprA: false
DirectToVgprB: false
DirectToVgprSparseMetadata: false
EdgeType: ShiftPtr
EnableF32XdlMathOp: true
EnableMatrixInstruction: true
ExpandPointerSwap: 0
ExpertSchedulingMode: 0
ForceDisableShadowInit: false
ForceUnrollSubIter: true
GlobalReadPerMfma: 1
GlobalReadVectorWidthA: 4
GlobalReadVectorWidthB: 4
GlobalSplitU: 0
GlobalSplitUAlgorithm: MultipleBuffer
GlobalSplitUCoalesced: false
GlobalSplitUWorkGroupMappingRoundRobin: false
GlobalWriteVectorWidth: 4
GroupLoadStore: false
GuaranteeNoPartialA: false
GuaranteeNoPartialB: true
GuaranteeNoPartialMetadata: true
ISA: [9, 5, 0]
InnerUnroll: 1
InterleaveAlpha: 0
InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true,
SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true}
Kernel: true
KernelLanguage: Assembly
KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x192x32_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1
LDSTrInst: 0
LSCA: 128
LSCB: 32
LSPA: 8
LSPB: 32
LVCA: 32
LVCB: 8
LVPA: 2
LVPB: 8
LdsBlockSizePerPadA: 1024
LdsBlockSizePerPadB: 1024
LdsBlockSizePerPadMetadata: 0
LdsBytesNoAmax: 107264
LdsInitCVgprs: false
LdsNumBytes: 107264
LdsNumElementsAlignedA: 16384
LdsNumElementsAlignedB: 25344
LdsNumElementsAlignedMetadata: 0
LdsOffsetA: 0
LdsOffsetA_Blk: 65536
LdsOffsetB: 16384
LdsOffsetB_Blk: 81920
LdsOffsetBias: 0
LdsOffsetBiasGSU: 0
LdsOffsetBiasNonGSU: 0
LdsOffsetMetadata: 16384
LdsOffsetMetadata_Blk: 81920
LdsPadA: 0
LdsPadB: 8
LdsPadMetadata: 0
LocalReadVectorWidth: 4
LocalSplitU: 1
LocalSplitUReuseLDS: 1
LocalWritePerMfma: -1
LocalWriteUseSgprA: true
LocalWriteUseSgprB: true
LoopIters: 1
LoopUnroll: 32
MFMA_BF16_1K: false
MIArchVgpr: false
MIBlock: [16, 16, 32, 1, 1, 1]
MIInputPerThread: 8
MIInputPerThreadA: 8
MIInputPerThreadB: 8
MIInputPerThreadMetadata: 8
MIOutputVectorWidth: 4
MIRegPerOut: 1
MIWaveGroup: [2, 2]
MIWaveTile: [4, 6]
MIWaveTileA: 4
MIWaveTileB: 6
MIWaveTileMetadata: 0
MacroTile0: 128
MacroTile1: 192
MacroTileA: 128
MacroTileB: 192
MagicDivAlg: 2
MathClocksUnrolledLoop: 0
MatrixInstB: 1
MatrixInstBM: 1
MatrixInstBN: 1
MatrixInstK: 32
MatrixInstM: 16
MatrixInstN: 16
MatrixInstruction: [16, 16, 32, 1]
MaxLDS: 163840
MaxOccupancy: 40
MbskPrefetchMethod: 0
MfmaInitCVgprs: true
NoLdsWriteCode: true
NoReject: false
NoTailLoop: false
NonDTLTailLoopA: false
NonDTLTailLoopB: false
NonTemporal: -1
NonTemporalA: 0
NonTemporalB: 0
NonTemporalC: 0
NonTemporalD: 0
NonTemporalE: 0
NonTemporalMetadata: 0
NonTemporalWS: 0
NumElementsPerBatchStore: 0
NumElementsPerThread: 96
NumGlobalWriteVectorsPerThread: 24
NumLoadsA: 4
NumLoadsB: 6
NumLoadsCoalescedA: 1
NumLoadsCoalescedB: 1
NumLoadsPerpendicularA: 4
NumLoadsPerpendicularB: 6
NumThreads: 256
NumTotalPackedLoadsA: 4
NumTotalPackedLoadsB: 6
NumWaveSplitK: 1
OptNoLoadLoop: 1
PackedC0IdxChars: [I]
PackedC0IndicesX: [0]
PackedC1IdxChars: [J]
PackedC1IndicesX: [1]
PrefetchGlobalRead: 2
PrefetchLocalRead: 0
PreloadKernArgs: true
SFCWGM:
- [1, 1]
- [1, 1]
ScheduleGlobalRead: 1
ScheduleIterAlg: 3
ScheduleLocalWrite: 1
SolutionIndex: 117
SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x192x32_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1
SourceSwap: false
SpaceFillingAlgo: []
StaggerU: 0
StaggerUMapping: 0
StaggerUStride: 128
StorePriorityOpt: false
StoreRemapVectorWidth: 0
StoreSwapAddr: false
StoreSyncOpt: 0
StoreVectorWidth: 4
StreamK: 3
StreamKAtomic: 0
StreamKFixupTreeReduction: 0
StreamKXCCMapping: 0
SubGroup0: 8
SubGroup1: 32
SubGroupA: 8
SubGroupB: 32
SuppressNoLoadLoop: false
SwapGlobalReadOrder: false
ThreadTile: [1, 1]
ThreadTile0: 16
ThreadTile1: 6
ThreadTileA: 16
ThreadTileB: 6
TransposeLDS: 1
TransposeLDSMetadata: true
ULSGRODoubleG2L: 0
UnrollLoopSwapGlobalReadOrder: 0
UnrollMajorLDSA: false
UnrollMajorLDSB: true
UnrollMajorLDSMetadata: true
Use64bShadowLimit: 1
UseCustomMainLoopSchedule: true
UseDirect32XEmulation: true
UseDot2F32XEmulation: false
UseDotInstruction: false
UseF32XEmulation: true
UseGeneralizedNLCOneA: true
UseGeneralizedNLCOneB: true
UseGeneralizedNLCOneMetadata: false
UseInstOffsetForGRO: 0
UsePLRPack: true
UseSgprForGRO: -1
Valid: true
VectorStore: -1
VectorWidthA: 4
VectorWidthB: 2
WaveSeparateGlobalReadA: 0
WaveSeparateGlobalReadB: 0
WaveSeparateGlobalReadMetadata: 0
WaveSplitK: false
WavefrontSize: 64
WorkGroup: [32, 8, 1]
WorkGroupMapping: 8
WorkGroupMappingXCC: 1
WorkGroupMappingXCCGroup: -1
WorkGroupReduction: false
WorkspaceCheck: [4, 0, 0]
_DepthU: 32
_DepthUA: 32
_DepthUB: 32
_DepthUMetadata: 32
_GlobalAccumulation: PartialsBuffer
_UseSgprForGRO: 0
_VectorStore: 1
_WorkspaceSizePerElemBias: 0
_WorkspaceSizePerElemC: 4
_staggerStrideShift: 0
enableGLTrA: false
enableGLTrB: false
enableLDSTrA: 0
enableLDSTrB: 0
numSubTiles: 2
reorderGRInstForDTVA: false
reorderGRInstForDTVB: false
tailLoopOptA: false
tailLoopOptB: false
- [2, 3, 0, 1]
- - - [4, 30, 8192, 128]
- [60, 0.0]
Expand Down Expand Up @@ -28233,6 +28477,8 @@
- [114, 135836.0]
- - [3072, 4096, 1, 8192]
- [116, 0.0]
- - [2048, 3072, 1, 8192]
- [117, 0.0]
- null
- null
- DeviceEfficiency
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2269,8 +2269,45 @@ def _get_schedule_128x192x32_TF32(kernel, useLDSTr, TLDS):
syncCode = []
nglshift = nllshift = 0 # vmcnt shift for ngl and nll
if isNN(kernel) and not useLDSTr and TLDS==1:
# TODO: Add NN schedule in upcoming PR
return False, None
kernel["UsePLRPack"] = True
syncTable = [
-1, SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Begininng of a iteration. Wait for prior local read.") ,
10, SWaitCnt(dscnt=1, vlcnt=-1, vscnt=-1, comment="Before PackA0. Wait for first all LRA0. Skip 1*LRB0") ,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This wait is a bit conservative. The trace shows a 100 clk wait on it. You could add intermediate waits, since you don't need all LRA0 instructions for the first CVTs. For example, I think you would only need 6 ds_reads to complete before the first 8 CVTs.

17, SWaitCnt(dscnt=6, vlcnt=-1, vscnt=-1, comment="Before GRA. Wait for all prior LRA0 for GRA. Skip 6*LRB0") ,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be SWaitCnt(dscnt=5) according to the trace, as the 6th is done at the same mfma index and after this wait. Wasn't it caught by the validator?

17, SBarrier(comment="GRA") ,
20, SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Before PackB0. Wait for all prior LRB0.") ,
29, SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Before GRB. Wait for all prior LRB0.") ,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: not needed given you already have a SWaitCnt(dscnt=0) at 20

29, SBarrier(comment="GRB") ,
35, SWaitCnt(dscnt=-1, vlcnt=6, vscnt=-1, comment="Before LRB3. Wait for GRB from previous iter. Skip 4*GRA + 2*GRB") ,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This actually waits for both GRA and GRB of the previous iteration, right? So it looks like your second SWaitCnt(vlcnt=10) at index 53 is not necessary?

35, SBarrier(comment="LRB") ,
44, SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Before PackB3. Wait for all LRB3 for PackB3.") ,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same remark. You could reduce this wait by adding intermediate SWaitCnts.

53, SWaitCnt(dscnt=-1, vlcnt=10, vscnt=-1, comment="Before LRA3. Wait for GRA from previous iter. Skip 4*GRA + 6*GRB") ,
53, SBarrier(comment="LRA") ,
64, SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Before PackA3. Wait for all prior LRA3.") ,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here. Possible intermediate SWaitCnt :)

]
optSchedule = {
'SYNC' : [syncTable[::2]],
'GRIncA': [[0, 0, 1, 1, 2, 2, 3, 3, 4]],
'GRIncB': [[4, 5, 5, 6, 6, 7, 7, 8, 8]],
'LRA0' : [[0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7]],
'LRB0' : [[8, 10, 12, 14, 16, 17],
[9, 11, 13, 15, 16, 18]],
'PackA0': [create_range(10, 6, 17, 1, 8)],
'PackB0': [create_range(20, 12, 34, 1, 6)],
'GRA' : [[19, 20, 21, 22, 23, 24, 25, 26]],
'GRB' : [[29, 30, 31, 32, 52, 53, 54, 55, 56, 57, 58, 59]],
'LRB3' : [[35, 36, 37, 38, 39, 40]],
'LRA3' : [[53, 53, 54, 54, 55, 55, 56, 56, 57, 57, 58, 58, 59, 59, 60, 61]],
'PackB3': [create_range(44, 9, 54, 1, 8)],
'PackA3': [create_range(64, 6, 71, 1, 8)],
'LRSA' : [[28]],
'LRSB' : [[28]],
'LWSA' : [[60]],
'LWSB' : [[61]],
'LCC' : [[71, 71]],
}
syncCode = syncTable[1::2]
nglshift = nllshift = 10
elif isTN(kernel) and not useLDSTr and TLDS==1:
kernel["UsePLRPack"] = True
syncTable = [
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -189,4 +189,32 @@ BenchmarkProblems:
- Range: [[192], [256], [1], [64, 64, 256]]
- Range: [[192], [256], [1], [1,1,64]]
- Range: [[192], [256], [1], [32, 64, 256]]
- BiasTypeArgs: ['b']
- BiasTypeArgs: ['b']
- # BenchmarkProblemSizeGroup - Standard - All problem
InitialSolutionParameters:
BenchmarkCommonParameters:
- KernelLanguage: ["Assembly"]
ForkParameters:
- MatrixInstruction:
- [16, 16, 32, 1, 1, 4, 6, 2, 2]
- DepthU: [32]
- LocalReadVectorWidth: [4]
- ScheduleIterAlg: [3]
- DirectToLds: [1]
- PrefetchGlobalRead: [2]
- PrefetchLocalRead: [1]
- UseCustomMainLoopSchedule: [1]
- StreamK: [3]
- StaggerU: [0]
- ClusterLocalRead: [1]
- TransposeLDS: [1]
- LDSTrInst: [0]
BenchmarkJoinParameters:
BenchmarkFinalParameters:
- ProblemSizes:
- Range: [[128], [192], [1], [64, 64, 256]]
- Range: [[128], [192], [1], [1,1,64]]
- Range: [[128], [192], [1], [32, 64, 256]]
- Range: [[4096], [6144], [1], [64, 64, 256]]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we just want to keep small shapes in this test file (removing these 2 lines)

- Exact: [2048, 3072, 1, 8192]
- BiasTypeArgs: ['b']
Loading