Skip to content

Commit f118bca

Browse files
authored
refactor: refine prefetch statistics in topdown monitor (#452)
* chore(pf): add miss_count for l2topdownmonitor
* refactor(pf): vectorize the selection of prefetchers
* chore(pf): more detailed prefetch statistics
* refactor: replace pfHit by hitPf to show verb and object attributes
* submodule(utility): bump for new buskey
* chore: update the utility in yml into master for synchronization
* fix(slice): give default value due to different option of prefetch
* Revert "chore: update the utility in yml into master for synchronization"
  This reverts commit 75ef181.
1 parent fffb491 commit f118bca

File tree

15 files changed

+225
-180
lines changed

15 files changed

+225
-180
lines changed

src/main/scala/coupledL2/BaseSlice.scala

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,13 +32,17 @@ abstract class BaseSliceIO[T_OUT <: BaseOuterBundle](implicit p: Parameters) ext
3232
val sliceId = Input(UInt(bankBits.W))
3333
val l1Hint = DecoupledIO(new L2ToL1Hint())
3434
val prefetch = prefetchOpt.map(_ => Flipped(new PrefetchIO))
35-
// val msStatus = topDownOpt.map(_ => Vec(mshrsAll, ValidIO(new MSHRStatus)))
3635
val dirResult = topDownOpt.map(_ => ValidIO(new DirResult))
37-
val latePF = topDownOpt.map(_ => ValidIO(UInt(PfSource.pfSourceBits.W)))
36+
val hitPfInMSHR = topDownOpt.map(_ => ValidIO(UInt(PfSource.pfSourceBits.W)))
37+
val pfSent = topDownOpt.map(_ => ValidIO(UInt(MemReqSource.reqSourceBits.W)))
38+
val pfLateInMSHR = topDownOpt.map(_ => ValidIO((UInt(MemReqSource.reqSourceBits.W))))
3839
val error = DecoupledIO(new L2CacheErrorInfo())
3940
val l2Miss = Output(Bool())
4041
val l2Flush = Option.when(cacheParams.enableL2Flush) (Input(Bool()))
4142
val l2FlushDone = Option.when(cacheParams.enableL2Flush) (Output(Bool()))
43+
// statistics
44+
val msStatus = topDownOpt.map(_ => Vec(mshrsAll, ValidIO(new MSHRStatus)))
45+
val msAlloc = topDownOpt.map(_ => Vec(mshrsAll, ValidIO(new MSHRAllocStatus)))
4246
}
4347

4448
abstract class BaseSlice[T_OUT <: BaseOuterBundle](implicit p: Parameters) extends L2Module with HasPerfEvents {

src/main/scala/coupledL2/Common.scala

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,48 @@ class PipeEntranceStatus(implicit p: Parameters) extends L2Bundle {
184184
def g_set = sets(3)
185185
}
186186

187+
/* MSHR info */
188+
189+
// MSHR exposes signals about allocation to Topdown
190+
class MSHRAllocStatus()(implicit p: Parameters) extends L2Bundle with HasTLChannelBits{ // per-MSHR allocation record; TopDownMonitor takes a Vec of these per bank as io.msAlloc
191+
val is_miss = Bool() // TopDownMonitor only counts a valid entry in mshr_count_* when this is set (and fromA is set; fromA presumably comes from HasTLChannelBits — confirm)
192+
val is_prefetch = Bool() // selects the counter: set -> mshr_count_Prefetch, clear -> mshr_count_CPU
193+
}
194+
195+
// MSHR exposes signals to MSHRCtl and Topdown
196+
class MSHRStatus(implicit p: Parameters) extends L2Bundle
197+
with HasTLChannelBits
198+
with HasCHIChannelBits
199+
{
200+
val set = UInt(setBits.W)
201+
val reqTag = UInt(tagBits.W) // NOTE(review): presumably the tag of the request held by this MSHR, as opposed to metaTag below — confirm
202+
val metaTag = UInt(tagBits.W)
203+
val needsRepl = Bool()
204+
val w_c_resp = Bool()
205+
val w_d_resp = Bool()
206+
val will_free = Bool()
207+
208+
/*
209+
val way = UInt(wayBits.W)
210+
val off = UInt(offsetBits.W)
211+
val opcode = UInt(3.W)
212+
val param = UInt(3.W)
213+
val size = UInt(msgSizeBits.W)
214+
val source = UInt(sourceIdBits.W)
215+
val alias = aliasBitsOpt.map(_ => UInt(aliasBitsOpt.get.W))
216+
val aliasTask = aliasBitsOpt.map(_ => Bool())
217+
val needProbeAckData = Bool() // only for B reqs
218+
val fromL2pft = prefetchOpt.map(_ => Bool())
219+
val needHint = prefetchOpt.map(_ => Bool())
220+
*/ // the field list above is commented out and is therefore not part of this bundle
221+
222+
// for TopDown usage
223+
val reqSource = UInt(MemReqSource.reqSourceBits.W) // NOTE(review): presumably the MemReqSource id of the request in this MSHR — confirm against the producer
224+
val is_miss = Bool() // TopDownMonitor counts a valid entry in the mshr_cycles_* histograms only when fromA && is_miss
225+
val is_prefetch = Bool() // selects the histogram: set -> mshr_cycles_Prefetch, clear -> mshr_cycles_CPU
226+
227+
}
228+
187229
// MSHR Task that MainPipe sends to MSHRCtl
188230
class MSHRRequest(implicit p: Parameters) extends L2Bundle {
189231
val dirResult = new DirResult()

src/main/scala/coupledL2/CoupledL2.scala

Lines changed: 13 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -562,23 +562,19 @@ abstract class CoupledL2Base(implicit p: Parameters) extends LazyModule with Has
562562
case EdgeOutKey => node.out.head._2
563563
case BankBitsKey => bankBits
564564
})))
565-
topDown match {
566-
case Some(t) =>
567-
t.io.msStatus.zip(slices).foreach {
568-
case (in, s) =>
569-
s match {
570-
case slice: tl2tl.Slice => in := slice.io_msStatus.get
571-
case slice: tl2chi.Slice => in := slice.io_msStatus.get
572-
}
573-
}
574-
t.io.dirResult.zip(slices).foreach {
575-
case (res, s) => res := s.io.dirResult.get
576-
}
577-
t.io.latePF.zip(slices).foreach {
578-
case (in, s) => in := s.io.latePF.get
579-
}
580-
t.io.debugTopDown <> io.debugTopDown
581-
case None => io.debugTopDown.l2MissMatch := false.B
565+
topDown.foreach { t => // when a monitor exists, feed it the statistics ports of every slice (slice i -> index i)
566+
for ((s, i) <- slices.zipWithIndex) {
567+
t.io.msStatus(i) := s.io.msStatus.get // NOTE(review): the s.io.* statistics ports are Options built from topDownOpt; .get assumes they are defined whenever topDown is — confirm
568+
t.io.msAlloc(i) := s.io.msAlloc.get
569+
t.io.dirResult(i) := s.io.dirResult.get
570+
t.io.hitPfInMSHR(i) := s.io.hitPfInMSHR.get
571+
t.io.pfLateInMSHR(i) := s.io.pfLateInMSHR.get
572+
t.io.pfSent(i) := s.io.pfSent.get
573+
}
574+
t.io.debugTopDown <> io.debugTopDown
575+
}
576+
if (topDown.isEmpty) { // topDown is a Scala Option, so this is decided at elaboration time: use a Scala `if` so the default is an unconditional connection instead of a constant-predicate hardware `when`
577+
io.debugTopDown.l2MissMatch := false.B
582578
}
583579

584580
io.l2Miss := RegNext(slices.map(_.io.l2Miss).reduce(_ || _))

src/main/scala/coupledL2/RequestBuffer.scala

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,8 @@ class RequestBuffer(flow: Boolean = true, entries: Int = 4)(implicit p: Paramete
8787
val set = UInt(setBits.W)
8888
}))
8989

90-
val hasLatePF = ValidIO(UInt(PfSource.pfSourceBits.W))
90+
val hasHitPfInMSHR = ValidIO(UInt(PfSource.pfSourceBits.W))
91+
val hasPfLateInMSHR = ValidIO(UInt(MemReqSource.reqSourceBits.W))
9192
val hasMergeA = Output(Bool())
9293
})
9394

@@ -178,10 +179,6 @@ class RequestBuffer(flow: Boolean = true, entries: Int = 4)(implicit p: Paramete
178179
// flow not allowed when full, or entries might starve
179180
val canFlow = flow.B && !full && !conflict(in) && !chosenQValid && !Cat(io.mainPipeBlock).orR && !noFreeWay(in)
180181
val doFlow = canFlow && io.out.ready
181-
val latePrefetchRes = latePrefetch(in)
182-
io.hasLatePF.valid := latePrefetchRes._1 && io.in.valid && !sameAddr(in, RegNext(in))
183-
io.hasLatePF.bits := latePrefetchRes._2
184-
io.hasMergeA := mergeA && io.in.valid && !sameAddr(in, RegNext(in))
185182

186183
// val depMask = buffer.map(e => e.valid && sameAddr(io.in.bits, e.task))
187184
// remove duplicate prefetch if same-addr A req in MSHR or ReqBuf
@@ -195,6 +192,14 @@ class RequestBuffer(flow: Boolean = true, entries: Int = 4)(implicit p: Paramete
195192
).asUInt
196193
val dup = isPrefetch && dupMask.orR
197194

195+
// statistics io
196+
val latePrefetchRes = latePrefetch(in)
197+
io.hasHitPfInMSHR.valid := latePrefetchRes._1 && io.in.valid && !sameAddr(in, RegNext(in))
198+
io.hasHitPfInMSHR.bits := latePrefetchRes._2
199+
io.hasPfLateInMSHR.valid := io.in.valid && dup
200+
io.hasPfLateInMSHR.bits := io.in.bits.reqSource
201+
io.hasMergeA := mergeA && io.in.valid && !sameAddr(in, RegNext(in))
202+
198203
//!! TODO: we can also remove those that duplicate with mainPipe
199204

200205
/* ======== Alloc ======== */

src/main/scala/coupledL2/TopDownMonitor.scala

Lines changed: 69 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -21,16 +21,18 @@ import chisel3._
2121
import chisel3.util._
2222
import coupledL2.prefetch.PfSource
2323
import coupledL2.utils._
24-
import coupledL2.tl2tl.MSHRStatus
2524
import utility._
2625

2726
// TODO: Accommodate CHI
2827
class TopDownMonitor()(implicit p: Parameters) extends L2Module {
2928
val banks = 1 << bankBits
3029
val io = IO(new Bundle() {
3130
val dirResult = Vec(banks, Flipped(ValidIO(new DirResult)))
32-
val msStatus = Vec(banks, Vec(mshrsAll, Flipped(ValidIO(new MSHRStatus))))
33-
val latePF = Vec(banks, Flipped(ValidIO(UInt(PfSource.pfSourceBits.W))))
31+
val msStatus = Vec(banks, Vec(mshrsAll, Flipped(ValidIO(new MSHRStatus))))
32+
val msAlloc = Vec(banks, Vec(mshrsAll, Flipped(ValidIO(new MSHRAllocStatus))))
33+
val hitPfInMSHR = Vec(banks, Flipped(ValidIO(UInt(PfSource.pfSourceBits.W))))
34+
val pfSent = Vec(banks, Flipped(ValidIO(UInt(MemReqSource.reqSourceBits.W))))
35+
val pfLateInMSHR = Vec(banks, Flipped(ValidIO(UInt(MemReqSource.reqSourceBits.W))))
3436
val debugTopDown = new Bundle {
3537
val robTrueCommit = Input(UInt(64.W))
3638
val robHeadPaddr = Flipped(Valid(UInt(36.W)))
@@ -55,28 +57,34 @@ class TopDownMonitor()(implicit p: Parameters) extends L2Module {
5557
}
5658

5759
io.debugTopDown.l2MissMatch := Cat(addrMatchVec.flatten).orR
58-
XSPerfAccumulate(s"${cacheParams.name}MissMatch", io.debugTopDown.l2MissMatch)
60+
XSPerfAccumulate(s"RobBlockBy${cacheParams.name}Miss", io.debugTopDown.l2MissMatch)
5961

6062
/* ====== PART TWO ======
6163
* Count the parallel misses, and divide them into CPU/Prefetch
6264
*/
63-
def allMSHRMatchVec(cond: MSHRStatus => Bool): IndexedSeq[Bool] = {
64-
io.msStatus.zipWithIndex.flatMap {
65-
case (slice, i) =>
65+
def allValidMatchVec[T <: Data](vec: Vec[Vec[ValidIO[T]]])(cond: T => Bool): IndexedSeq[Bool] = { // flattens a two-level Vec of ValidIO entries into one Bool per entry
66+
vec.flatMap{ // outer level: one inner Vec per element of `vec` (callers pass the per-bank io.msStatus / io.msAlloc)
67+
case slice =>
6668
slice.map {
6769
ms => ms.valid && cond(ms.bits) // an entry matches only when it is valid and its payload satisfies `cond`
6870
}
6971
}
7072
}
7173

72-
val missVecCPU = allMSHRMatchVec(s => s.fromA && s.is_miss && !s.is_prefetch)
73-
val missVecPref = allMSHRMatchVec(s => s.fromA && s.is_miss && s.is_prefetch)
74-
// val missVecAll = allMSHRMatchVec(s => s.fromA && s.is_miss)
75-
74+
val missVecCPU = allValidMatchVec(io.msStatus)(s => s.fromA && s.is_miss && !s.is_prefetch)
75+
val missVecPref = allValidMatchVec(io.msStatus)(s => s.fromA && s.is_miss && s.is_prefetch)
76+
// val missVecAll = allValidMatchVec(io.msStatus)(s => s.fromA && s.is_miss)
7677
val totalMSHRs = banks * mshrsAll
77-
XSPerfHistogram("parallel_misses_CPU" , PopCount(missVecCPU), true.B, 0, totalMSHRs, 1)
78-
XSPerfHistogram("parallel_misses_Pref", PopCount(missVecPref), true.B, 0, totalMSHRs, 1)
79-
XSPerfHistogram("parallel_misses_All" , PopCount(missVecCPU)+PopCount(missVecPref), true.B, 0, 32, 1)
78+
XSPerfHistogram("mshr_cycles_CPU" , PopCount(missVecCPU), true.B, 0, totalMSHRs, 1)
79+
XSPerfHistogram("mshr_cycles_Prefetch", PopCount(missVecPref), true.B, 0, totalMSHRs, 1)
80+
XSPerfHistogram("mshr_cycles_All" , PopCount(missVecCPU)+PopCount(missVecPref), true.B, 0, totalMSHRs, 1)
81+
82+
// count the number of misses (as opposed to the per-cycle MSHR occupancy histograms above)
83+
val missCountCPU = allValidMatchVec(io.msAlloc)(s => s.fromA && s.is_miss && !s.is_prefetch)
84+
val missCountPref = allValidMatchVec(io.msAlloc)(s => s.fromA && s.is_miss && s.is_prefetch)
85+
XSPerfAccumulate("mshr_count_CPU", PopCount(missCountCPU))
86+
XSPerfAccumulate("mshr_count_Prefetch", PopCount(missCountPref))
87+
XSPerfAccumulate("mshr_count_All", PopCount(missCountCPU) + PopCount(missCountPref))
8088

8189
/* ====== PART THREE ======
8290
* Distinguish req sources and count num & miss
@@ -92,11 +100,6 @@ class TopDownMonitor()(implicit p: Parameters) extends L2Module {
92100
}
93101
}
94102

95-
def reqFromCPU(r: DirResult): Bool = {
96-
r.replacerInfo.reqSource === MemReqSource.CPULoadData.id.U ||
97-
r.replacerInfo.reqSource === MemReqSource.CPUStoreData.id.U
98-
}
99-
100103
for (i <- 0 until MemReqSource.ReqSourceCount.id) {
101104
val sourceMatchVec = dirResultMatchVec(r => r.replacerInfo.reqSource === i.U)
102105
val sourceMatchVecMiss = dirResultMatchVec(r => r.replacerInfo.reqSource === i.U && !r.hit)
@@ -120,43 +123,62 @@ class TopDownMonitor()(implicit p: Parameters) extends L2Module {
120123
)
121124

122125
// sent/useful vector
123-
val l2prefetchSentVec = pfTypes.map { case (_, reqSrc, _) => dirResultMatchVec(r => r.replacerInfo.reqSource === reqSrc) }
124-
val l2prefetchUsefulVec = pfTypes.map { case (_, _, pfSrc) =>
125-
dirResultMatchVec(r => reqFromCPU(r) && r.hit &&
126+
val l2pfSentVec = pfTypes.map { case (_, reqSrc, _) => io.pfSent.map(r => r.valid && r.bits === reqSrc) }
127+
val l2pfSentToPipeVec = pfTypes.map { case (_, reqSrc, _) => dirResultMatchVec(r => r.replacerInfo.reqSource === reqSrc) }
128+
val l2hitPfInCacheVec = pfTypes.map { case (_, _, pfSrc) =>
129+
dirResultMatchVec(r => MemReqSource.isCPUReq(r.replacerInfo.reqSource) && r.hit &&
126130
r.meta.prefetch.getOrElse(false.B) && r.meta.prefetchSrc.getOrElse(PfSource.NoWhere.id.U) === pfSrc)
127131
}
128-
val l2prefetchLateVec = pfTypes.map { case (_, _, pfSrc) =>
129-
io.latePF.map(r => r.valid && r.bits === pfSrc)
132+
val l2hitPfInMSHRVec = pfTypes.map { case (_, _, pfSrc) =>
133+
io.hitPfInMSHR.map(r => r.valid && r.bits === pfSrc)
130134
}
131-
132-
// to summary
133-
val l2prefetchSent = dirResultMatchVec(
134-
r => MemReqSource.isL2Prefetch(r.replacerInfo.reqSource)
135+
val l2pfLateInCache = pfTypes.map { case (_, reqSrc, _) =>
136+
dirResultMatchVec(r => MemReqSource.isL2Prefetch(r.replacerInfo.reqSource) && r.hit &&
137+
!r.meta.prefetch.getOrElse(false.B) && r.replacerInfo.reqSource === reqSrc)
138+
}
139+
val l2pfLateInMSHR = pfTypes.map { case (_, reqSrc, _) =>
140+
io.pfLateInMSHR.map(r => r.valid && r.bits === reqSrc)
141+
}
142+
val l2hitPfVec = l2hitPfInCacheVec.zip(l2hitPfInMSHRVec).map { case (c, m) => PopCount(c) +& PopCount(m) } // `+&` grows the result by one bit; plain `+` keeps the operand width, so the sum would wrap when both counts are at their maximum (e.g. 1 + 1 with a single bank)
143+
val l2pfLateVec = l2pfLateInCache.zip(l2pfLateInMSHR).map { case (c, m) => PopCount(c) +& PopCount(m) } // same widening add as l2hitPfVec: in-cache and in-MSHR events can occur in the same cycle
144+
val l2demandMiss = dirResultMatchVec(
145+
r => MemReqSource.isCPUReq(r.replacerInfo.reqSource) && !r.hit
135146
)
136-
val l2prefetchUseful = dirResultMatchVec(
137-
r => reqFromCPU(r) && r.hit && r.meta.prefetch.getOrElse(false.B)
147+
val l2prefetchMiss = dirResultMatchVec(
148+
r => MemReqSource.isL2Prefetch(r.replacerInfo.reqSource) && !r.hit
138149
)
139-
val l2demandMiss = dirResultMatchVec(
140-
r => reqFromCPU(r) && !r.hit
150+
val l1prefetchMiss = dirResultMatchVec(
151+
r => MemReqSource.isL1Prefetch(r.replacerInfo.reqSource) && !r.hit
141152
)
142-
val l2prefetchLate = io.latePF.map(_.valid)
143-
// TODO: get difference prefetchSrc for detailed analysis
144-
// FIXME lyq: it's abnormal l2prefetchLate / l2prefetchUseful is more than 1
145153

146154
// PF Accuracy/Coverage/Late Accumulate/Rolling
147-
XSPerfAccumulate("l2prefetchSent", PopCount(l2prefetchSent))
148-
XSPerfAccumulate("l2prefetchUseful", PopCount(l2prefetchUseful))
149155
XSPerfAccumulate("l2demandMiss", PopCount(l2demandMiss))
150-
XSPerfAccumulate("l2prefetchLate", PopCount(l2prefetchLate))
151-
XSPerfRolling("L2PrefetchAccuracy", PopCount(l2prefetchUseful), PopCount(l2prefetchSent), 1000, io.debugTopDown.robTrueCommit, clock, reset)
152-
XSPerfRolling("L2PrefetchCoverage", PopCount(l2prefetchUseful), PopCount(l2prefetchUseful) + PopCount(l2demandMiss), 1000, io.debugTopDown.robTrueCommit, clock, reset)
153-
XSPerfRolling("L2PrefetchLate", PopCount(l2prefetchLate), PopCount(l2prefetchUseful), 1000, io.debugTopDown.robTrueCommit, clock, reset)
154-
for ((name, _, _, sent, useful, late) <- pfTypes zip l2prefetchSentVec zip l2prefetchUsefulVec zip l2prefetchLateVec map { case (((a, b), c), d) => (a._1, a._2, a._3, b, c, d) }) {
155-
XSPerfAccumulate(s"l2prefetchSent$name", PopCount(sent))
156-
XSPerfAccumulate(s"l2prefetchUseful$name", PopCount(useful))
157-
XSPerfAccumulate(s"l2prefetchLate$name", PopCount(late))
158-
XSPerfRolling(s"L2PrefetchAccuracy$name", PopCount(useful), PopCount(sent), 1000, io.debugTopDown.robTrueCommit, clock, reset)
159-
XSPerfRolling(s"L2PrefetchCoverage$name", PopCount(useful), PopCount(useful) + PopCount(l2demandMiss), 1000, io.debugTopDown.robTrueCommit, clock, reset)
156+
XSPerfAccumulate("l1prefetchMiss", PopCount(l1prefetchMiss))
157+
XSPerfAccumulate("l2prefetchMiss", PopCount(l2prefetchMiss))
158+
XSPerfAccumulate("l2prefetchSent", PopCount(l2pfSentVec.flatten))
159+
XSPerfAccumulate("l2prefetchSentToPipe", PopCount(l2pfSentToPipeVec.flatten))
160+
XSPerfAccumulate("l2prefetchHit", l2hitPfVec.reduce(_ + _))
161+
XSPerfAccumulate("l2prefetchHitInCache", PopCount(l2hitPfInCacheVec.flatten))
162+
XSPerfAccumulate("l2prefetchHitInMSHR", PopCount(l2hitPfInMSHRVec.flatten))
163+
XSPerfAccumulate("l2prefetchLate", l2pfLateVec.reduce(_ + _))
164+
XSPerfAccumulate("l2prefetchLateInCache", PopCount(l2pfLateInCache.flatten))
165+
XSPerfAccumulate("l2prefetchLateInMSHR", PopCount(l2pfLateInMSHR.flatten))
166+
XSPerfRolling("L2PrefetchAccuracy", l2hitPfVec.reduce(_ + _), PopCount(l2pfSentVec.flatten), 1000, io.debugTopDown.robTrueCommit, clock, reset)
167+
XSPerfRolling("L2PrefetchLate", l2pfLateVec.reduce(_ + _), PopCount(l2pfSentVec.flatten), 1000, io.debugTopDown.robTrueCommit, clock, reset)
168+
XSPerfRolling("L2PrefetchCoverage", l2hitPfVec.reduce(_ + _), l2hitPfVec.reduce(_ + _) + PopCount(l2demandMiss), 1000, io.debugTopDown.robTrueCommit, clock, reset)
169+
for ((x, i) <- pfTypes.zipWithIndex) {
170+
val name = x._1
171+
XSPerfAccumulate(s"l2prefetchSent$name", PopCount(l2pfSentVec(i)))
172+
XSPerfAccumulate(s"l2prefetchSentToPipe$name", PopCount(l2pfSentToPipeVec(i)))
173+
XSPerfAccumulate(s"l2prefetchHit$name", l2hitPfVec(i))
174+
XSPerfAccumulate(s"l2prefetchHitInCache$name", PopCount(l2hitPfInCacheVec(i)))
175+
XSPerfAccumulate(s"l2prefetchHitInMSHR$name", PopCount(l2hitPfInMSHRVec(i)))
176+
XSPerfAccumulate(s"l2prefetchLate$name", l2pfLateVec(i))
177+
XSPerfAccumulate(s"l2prefetchLateInCache$name", PopCount(l2pfLateInCache(i)))
178+
XSPerfAccumulate(s"l2prefetchLateInMSHR$name", PopCount(l2pfLateInMSHR(i)))
179+
XSPerfRolling(s"L2PrefetchAccuracy$name", l2hitPfVec(i), PopCount(l2pfSentVec(i)), 1000, io.debugTopDown.robTrueCommit, clock, reset)
180+
XSPerfRolling(s"L2PrefetchLate$name", l2pfLateVec(i), PopCount(l2pfSentVec(i)), 1000, io.debugTopDown.robTrueCommit, clock, reset)
181+
XSPerfRolling(s"L2PrefetchCoverage$name", l2hitPfVec(i), l2hitPfVec(i) + PopCount(l2demandMiss), 1000, io.debugTopDown.robTrueCommit, clock, reset)
160182
}
161183

162184
}

0 commit comments

Comments
 (0)