OpenXiangShan · Llemonade · Jan 7, 2026 · Oct 29, 2025 · Jan 7, 2026 · Jan 9, 2026
diff --git a/configs/common/Options.py b/configs/common/Options.py
@@ -306,6 +306,11 @@ def addCommonOptions(parser, configure_xiangshan=False):
                         help="""
                         Prefetching cache level for SMS'pht""")
 
+    parser.add_argument("--enable-pf-buffer", action="store_true", default=False,
+                        help="""
+                        Force all hardware prefetchers to enable their
+                        optional prefetch buffer (QueuedPrefetcher.use_pf_buffer).""")
+
     parser.add_argument("--cpu-clock", action="store", type=str,
                         default='3GHz',
                         help="Clock for blocks running at CPU speed")

diff --git a/configs/common/PrefetcherConfig.py b/configs/common/PrefetcherConfig.py
@@ -20,6 +20,10 @@ def create_prefetcher(cpu, cache_level, options):
         prefetcher = _get_hwp(prefetcher_name)
         print(f"create_prefetcher at {cache_level}: {prefetcher_name}")
 
+    if prefetcher != NULL and getattr(options, 'enable_pf_buffer', False):
+        if hasattr(prefetcher, 'use_pf_buffer'):
+            prefetcher.use_pf_buffer = True
+
     if prefetcher == NULL:
         return NULL
 
@@ -59,6 +63,10 @@ def create_prefetcher(cpu, cache_level, options):
             prefetcher.enable_activepage = False
             prefetcher.enable_pht = True
             prefetcher.enable_xsstream = True
+            prefetcher.prefetch_train = False # disable L1PF train L2
+            # disable unnecessary filter to align with RTL when in pf_buffer mode
+            if hasattr(prefetcher, 'queue_filter'):
+                prefetcher.queue_filter = False
 
     if cache_level == 'l2':
         if options.classic_l2:
@@ -72,6 +80,10 @@ def create_prefetcher(cpu, cache_level, options):
                 prefetcher.enable_despacito_stream = False
                 prefetcher.bop_large = XSVirtualLargeBOP(is_sub_prefetcher=True,enable_adaptoffset=False)
                 prefetcher.bop_small = XSPhysicalSmallBOP(is_sub_prefetcher=True,enable_adaptoffset=False)
+                prefetcher.prefetch_train = False # disable L1PF train L2
+                # disable unnecessary filter to align with RTL when in pf_buffer mode
+                if hasattr(prefetcher, 'queue_filter'):
+                    prefetcher.queue_filter = False
             if options.l1_to_l2_pf_hint:
                 prefetcher.queue_size = 64
                 prefetcher.max_prefetch_requests_with_pending_translation = 128
@@ -88,10 +100,17 @@ def create_prefetcher(cpu, cache_level, options):
                 prefetcher.enable_bop = True
                 prefetcher.enable_cdp = False
                 prefetcher.enable_despacito_stream = False
+                if prefetcher.enable_despacito_stream:
+                    # if you want to check despacito pattern trace, set this to True
+                    prefetcher.despacito_stream.enable_despacito_db = False
                 prefetcher.bop_large = XSVirtualLargeBOP(is_sub_prefetcher=True,enable_adaptoffset=False)
                 prefetcher.bop_small = XSPhysicalSmallBOP(is_sub_prefetcher=True,enable_adaptoffset=False)
+                prefetcher.prefetch_train = False # disable L1PF train L2
+                # disable unnecessary filter to align with RTL when in pf_buffer mode
+                if hasattr(prefetcher, 'queue_filter'):
+                    prefetcher.queue_filter = False
             if options.l1_to_l2_pf_hint:
-                prefetcher.queue_size = 64
+                prefetcher.queue_size = 32
                 prefetcher.max_prefetch_requests_with_pending_translation = 128
 
     if cache_level == 'l3':

diff --git a/configs/common/xiangshan.py b/configs/common/xiangshan.py
@@ -708,6 +708,7 @@ def _finish_xiangshan_system(args, test_sys, TestCPUClass, ruby):
         test_sys.arch_db.dump_l3_evict_trace = False
         test_sys.arch_db.dump_l1_miss_trace = False
         test_sys.arch_db.dump_bop_train_trace = False
+        test_sys.arch_db.dump_stride_train_trace = False
         test_sys.arch_db.dump_sms_train_trace = False
         test_sys.arch_db.dump_vaddr_trace = False
         test_sys.arch_db.dump_lifetime = False
@@ -780,6 +781,29 @@ def _finish_xiangshan_system(args, test_sys, TestCPUClass, ruby):
             "Conf INT NOT NULL," \
             "Miss BOOL NOT NULL," \
             "SITE TEXT);"
+            ,
+            "CREATE TABLE StrideTrainTrace(" \
+            "ID INTEGER PRIMARY KEY AUTOINCREMENT," \
+            "Tick INT NOT NULL," \
+            "Addr INT NOT NULL," \
+            "PC INT NOT NULL," \
+            "HashPC INT NOT NULL," \
+            "QueryHit BOOL NOT NULL," \
+            "IsFirstShot BOOL NOT NULL," \
+            "Miss BOOL NOT NULL," \
+            "IsTrain BOOL NOT NULL," \
+            "SITE TEXT);"
+            ,
+            "CREATE TABLE DespacitoTrainTrace(" \
+            "ID INTEGER PRIMARY KEY AUTOINCREMENT," \
+            "Tick INT NOT NULL," \
+            "vAddr INT NOT NULL," \
+            "pAddr INT NOT NULL," \
+            "PC INT NOT NULL," \
+            "hasPC BOOL NOT NULL," \
+            "Miss BOOL NOT NULL," \
+            "IsTrain BOOL NOT NULL," \
+            "SITE TEXT);"
             ,# perfCounter CommitTrace
             perfCCT_cmd
         ]

diff --git a/configs/example/idealkmhv3.py b/configs/example/idealkmhv3.py
@@ -138,7 +138,8 @@ def setKmhV3IdealParams(args, system):
     # If user didn't specify bp_type, set default based on ideal_kmhv3
     args.bp_type = 'DecoupledBPUWithBTB'
     args.l2_size = '2MB'
-
+    # Keep prefetch buffers disabled for all hardware prefetchers in this config.
+    args.enable_pf_buffer = False
     # Match the memories with the CPUs, based on the options for the test system
     TestMemClass = Simulation.setMemClass(args)
 

diff --git a/configs/example/kmhv2.py b/configs/example/kmhv2.py
@@ -30,7 +30,8 @@
     # disable l1 berti, l2 cdp
     args.l2_wrapper_hwp_type = "L2CompositeWithWorkerPrefetcher"
     args.kmh_align = True
-
+    # Enable prefetch buffers for all hardware prefetchers in this config.
+    args.enable_pf_buffer = True
     assert not args.external_memory_system
 
     test_mem_mode = 'timing'

diff --git a/configs/example/kmhv3.py b/configs/example/kmhv3.py
@@ -182,6 +182,9 @@ def setKmhV3Params(args, system):
 
     assert not args.external_memory_system
 
+    # Enable prefetch buffers for all hardware prefetchers in this config.
+    args.enable_pf_buffer = True
+
     # Set default bp_type based on ideal_kmhv3 flag
     # If user didn't specify bp_type, set default based on ideal_kmhv3
     args.bp_type = 'DecoupledBPUWithBTB'

diff --git a/docs/prefetch_cache_partition_plan.md b/docs/prefetch_cache_partition_plan.md
@@ -0,0 +1,82 @@
+# 预取专用Cache分区计划
+
+## 目标
+为 Cache 增加一个只存放预取行的分区。预取的插入/替换/查询仅在该分区内进行，普通需求行不得占用。需求命中应能正常读取，是否迁移需定义。
+
+## 现有关键位置
+- 访问/命中/未命中与填充逻辑位于 [src/mem/cache/base.cc](src/mem/cache/base.cc)。
+- Cache 前端类在 [src/mem/cache/cache.hh](src/mem/cache/cache.hh) 及实现 [src/mem/cache/cache.cc](src/mem/cache/cache.cc)。
+- 标签与组相联实现位于 [src/mem/cache/tags/base.hh](src/mem/cache/tags/base.hh)、[src/mem/cache/tags/base_set_assoc.hh](src/mem/cache/tags/base_set_assoc.hh)、[src/mem/cache/tags/base_set_assoc.cc](src/mem/cache/tags/base_set_assoc.cc)。
+- 预取元数据在填充时写入，逻辑在 base.cc 的填充路径。
+
+## 待确认设计点
+1) 分区形态：每组预留若干路 vs 独立标签存储，倾向每组预留路以便接入现有索引。
+2) 需求命中是否迁移到主分区，或仅标记“已需求访问”并留在预取分区。
+3) 替换策略：预取分区独立策略实例，或复用同一策略但作用于子集路。
+4) 容量参数：按路数/字节/比例暴露，支持 0 关闭。
+5) 预取分区驱逐与主分区在一致性/写回语义上是否完全相同。
+
+## 工作步骤
+1) **参数面**：新增 Cache 参数（开关与分区大小），配置层与 SCons 暴露。
+2) **块元数据**：扩展 `CacheBlk` 标记分区驻留与是否被需求提升，覆盖序列化/重置。
+3) **标签/组相联分区**：在 BaseSetAssoc（或派生）维护主/预取两套替换池；查找覆盖两分区；按分区选牺牲块。
+4) **查找路径**：`BaseCache::access` 同查两分区；命中预取分区时的处理与统计拆分。
+5) **填充/分配路由**：`handleFill`/`allocateBlock` 将预取响应路由至预取分区，禁止非预取分配进入；分区满时仅内部驱逐。
+6) **提升策略**：需求命中预取分区时决定“迁移到主分区”或“原地标记需求”，并更新替换元数据。
+7) **驱逐/写回路径**：`evictBlock`、`doWritebacks`、CleanEvict 保持分区隔离但语义一致，统计独立。
+8) **预取队列**：确保 MSHR 分配对预取请求打标，便于填充分区路由识别；必要时增加断言/计数。
+9) **统计/探针**：预取分区占用、命中、填充、驱逐、提升、DOA 等计数，必要时调整暖身处理。
+10) **配置与文档**：更新 Cache.py/PrefetcherConfig.py 等配置，补充用户文档与调优说明。
+11) **测试**：微基准覆盖仅预取、混合需求+预取、启用分区的 checkpoint/恢复、一致性与失效场景。
+
+## 风险/关注点
+- 替换策略需支持双池。
+- 分区化可能影响延迟建模（tag/data 端口计数）。
+- 序列化需保留分区与替换状态。
+- 默认关闭以确保兼容性。
+
+## 讨论更新（方案锁定后续步骤）
+基于已确认的设计选项：
+1) 分区形态：每组预留若干路。
+2) 需求命中仅标记“已需求访问”并留在预取分区，不迁移。
+3) 替换策略：复用同一替换策略实例，但作用于分区内子集路。
+4) 容量参数：按比例暴露，支持 0 关闭。
+5) 驱逐语义：预取分区与主分区在一致性/写回语义上完全相同。
+
+### 下一步工作规划
+1) **参数落地**：新增比例型分区参数与开关；Python 配置与 SCons 接口同步；默认 0 关闭。
+2) **块元数据**：为 `CacheBlk` 增加分区驻留标记与“已需求访问”标志；补齐序列化/重置逻辑。
+3) **分区化标签/替换**：在 BaseSetAssoc（或派生）实现按组预留路的双分区池，复用同一替换策略实例但限制候选路集；提供按分区选 victim 的接口。
+4) **访问路径**：`BaseCache::access` 查两分区；命中预取分区时仅标记“已需求访问”，不迁移；统计分拆。
+5) **填充/分配路由**：`handleFill`/`allocateBlock` 将预取响应放入预取分区；普通需求不得占用；分区满时仅内部驱逐，语义与主分区一致。
+6) **驱逐与写回**：复用现有写回/一致性流程，但保持分区隔离与统计独立；确保 CleanEvict/Writeback 行为一致。
+7) **预取标记链路**：确保 MSHR/请求链路对预取打标，便于填充分区路由；缺标时记录计数/告警。
+8) **统计**：新增占用、命中、填充、驱逐、已需求访问计数（含 DOA），按分区拆分；若使用比例参数，记录实际预留路数。
+9) **测试计划**：
+	- 仅预取流：验证预取分区被使用且需求不会进入。
+	- 混合流：需求命中预取分区仅标记不迁移；容量隔离有效。
+	- 恢复/一致性：checkpoint/restore，CleanEvict/Writeback 语义一致性。
+	- 关闭模式：参数为 0 时行为与原先一致。
+
+## 讨论更新（追加）
+新的设计点修正：
+1) 分区形态：每组预留若干路，按比例参数计算预留路数；向下取整；若不足 1 路则该组预留 0（等价于禁用）。
+2) 需求命中：仅标记“已需求访问”，不迁移出预取分区。
+3) 替换策略：预取分区使用独立的替换策略实例（与主分区解耦）。
+4) 容量参数：按比例暴露，支持 0 关闭；组数若非幂次则报错（仅在参数为幂次组数时工作）。
+5) 驱逐语义：预取分区与主分区在一致性/写回上完全一致。
+
+### 下一步工作规划（根据新设计点）
+1) **参数实现**：比例型分区参数，向下截断到整路；不足 1 路取 0；若组数非幂次直接报错；默认 0 关闭。配置层与 SCons 同步。
+2) **块元数据**：`CacheBlk` 增加分区驻留与“已需求访问”标志，含序列化/重置。
+3) **分区化标签/替换**：BaseSetAssoc（或派生）维护主/预取双池，预取池使用独立替换策略实例；按组预留路数由比例计算；victim 选择限定在各自池内。
+4) **访问路径**：`BaseCache::access` 查两分区；命中预取分区仅标记已需求访问，不迁移；统计分拆。
+5) **填充/分配路由**：`handleFill`/`allocateBlock` 将预取响应放入预取分区，普通需求不得占用；预取分区满时仅内部驱逐，语义与主分区一致。
+6) **驱逐与写回**：复用现有流程，保持分区隔离与统计独立，保证 CleanEvict/Writeback 语义一致。
+7) **预取标记链路**：MSHR/请求需带预取标记，缺标计数/告警，确保填充分区路由正确。
+8) **统计**：分区占用、命中、填充、驱逐、已需求访问、DOA；记录比例参数实际转换的预留路数。
+9) **测试**：
+	- 仅预取流：验证预取分区启用与容量隔离。
+	- 混合流：需求命中不迁移，统计正确。
+	- 非幂次组数：应触发报错覆盖。
+	- 关闭模式：参数为 0 时回归旧行为。
diff --git a/src/cpu/o3/dyn_inst.cc b/src/cpu/o3/dyn_inst.cc
@@ -63,7 +63,7 @@ namespace o3
 DynInst::DynInst(const Arrays &arrays, const StaticInstPtr &static_inst,
         const StaticInstPtr &_macroop, InstSeqNum seq_num, CPU *_cpu)
     : seqNum(seq_num), staticInst(static_inst),
-      xsMeta(new XsDynInstMeta()),
+      xsMeta(new XsDynInstMeta(seq_num)),
       cpu(_cpu),
       _numSrcs(arrays.numSrcs), _numDests(arrays.numDests),
       _flatDestIdx(arrays.flatDestIdx), _destIdx(arrays.destIdx),

diff --git a/src/cpu/o3/dyn_inst_xsmeta.hh b/src/cpu/o3/dyn_inst_xsmeta.hh
@@ -59,12 +59,14 @@ namespace o3
 
 class XsDynInstMeta : public RefCounted
 {
-public:
-        bool squashed;
-        Addr instAddr;
+  public:
+    bool squashed;
+    Addr instAddr;
+    InstSeqNum seqNum;
 
-public:
-    XsDynInstMeta(): squashed(false),instAddr(0) {}
+  public:
+    XsDynInstMeta(): squashed(false), instAddr(0), seqNum(0) {}
+    XsDynInstMeta(InstSeqNum seq): squashed(false), instAddr(0), seqNum(seq) {}
 };
 
 using XsDynInstMetaPtr = RefCountingPtr<XsDynInstMeta>;

diff --git a/src/mem/cache/base.cc b/src/mem/cache/base.cc
@@ -514,6 +514,13 @@ BaseCache::handleTimingReqMiss(PacketPtr pkt, MSHR *mshr, CacheBlk *blk,
                     pkt->pfSource = mshr->getPFSource();
                     pkt->pfDepth = mshr->getPFDepth();
 
+                    // Demand request merging into prefetch-only MSHR
+                    if (pkt->isDemand()) {
+                        stats.demandMergedIntoPfMSHR++;
+                        DPRINTF(Cache, "Demand request %#lx merged into prefetch MSHR\n",
+                                pkt->getAddr());
+                    }
+
                 } else if (mshr->hasFromCPU()) {
                     // no pkt in mshr originated from cache; all of them are from cpu
                     pkt->coalescingMSHR = true;
@@ -2887,6 +2894,12 @@ BaseCache::CacheStats::CacheStats(BaseCache &c)
              "number of squashed dead block replacements"),
     ADD_STAT(squashedLiveBlockReplacements, statistics::units::Count::get(),
                 "number of squashed live block replacements"),
+    ADD_STAT(pfMergedWithDemand, statistics::units::Count::get(),
+             "number of MSHR completions where prefetch was merged with demand"),
+    ADD_STAT(pfOnlyFill, statistics::units::Count::get(),
+             "number of MSHR completions with only prefetch (no demand merge)"),
+    ADD_STAT(demandMergedIntoPfMSHR, statistics::units::Count::get(),
+             "number of demand requests that merged into prefetch MSHR"),
     ADD_STAT(squashedDemandHits, statistics::units::Count::get(),
              "number of squashed inst block demand hits"),
     ADD_STAT(loadTagReadFails, statistics::units::Count::get(),

diff --git a/src/mem/cache/base.hh b/src/mem/cache/base.hh
@@ -1313,6 +1313,13 @@ class BaseCache : public ClockedObject, public CacheAccessor
         /** Number of replacements of blocks from squashed inst but reused. */
         statistics::Scalar squashedLiveBlockReplacements;
 
+        /** Number of MSHR completions where prefetch was merged with demand */
+        statistics::Scalar pfMergedWithDemand;
+        /** Number of MSHR completions with only prefetch (no demand merge) */
+        statistics::Scalar pfOnlyFill;
+        /** Number of demand requests that merged into prefetch MSHR */
+        statistics::Scalar demandMergedIntoPfMSHR;
+
         /** Number of demand hits that accessed squashed inst blocks. */
         statistics::Scalar squashedDemandHits;
 

diff --git a/src/mem/cache/cache.cc b/src/mem/cache/cache.cc
@@ -988,6 +988,12 @@ Cache::serviceMSHRTargets(MSHR *mshr, const PacketPtr pkt, CacheBlk *blk)
         blk->setPrefetched();
         blk->setXsMetadata(pkt->req->getXsMetadata());
         DPRINTF(Cache, "Marking block as prefetched from prefetcher %i\n", blk->getXsMetadata().prefetchSource);
+        stats.pfOnlyFill++;  // Pure prefetch fill (no demand merge)
+    } else if (blk && from_core && from_pref) {
+        // Prefetch was merged with demand - won't be marked as prefetched
+        stats.pfMergedWithDemand++;
+        DPRINTF(Cache, "Prefetch merged with demand for %#lx - not marking as prefetched\n",
+                blk->getTag());
     }
 
     if (!mshr->hasLockedRMWReadTarget()) {