Skip to content
Open
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
2833b84
mem: Refactor Prefetch Filter Implementation and Enhance Prefetching …
Llemonade Jan 7, 2026
56369e4
mem-cache: Add TrainFilter to prefetcher
Oct 29, 2025
fecf3ce
mem-cache: Align prefetch configuration with RTL implementation
XXtaoo Jan 7, 2026
8d94c48
mem: disable L2PFfilter when use PFbuffer mode
Llemonade Jan 9, 2026
63b7168
mem: add counters for XSCompositePrefetcher and XSStride
Llemonade Jan 13, 2026
8611553
mem: add Stride train trace archdb
Llemonade Jan 14, 2026
3f65286
mem: add util script for StrideTrainTrace analysis
Llemonade Jan 14, 2026
00d1b2a
mem: fix stride redundant table bug
Llemonade Jan 15, 2026
946a2ef
mem: support bop berti cmc spp opt opcp in PFbuffer mode
Llemonade Jan 15, 2026
43c3d3a
mem: add act stream support in PF_buffer mode
Llemonade Jan 16, 2026
5a20495
mem: enable PF_buffer mode in ideal_kmhv3 and kmhv2 configs
Llemonade Jan 16, 2026
13fa61a
mem: add prefetch_train parameter to control L1PF trigger L2Train
Llemonade Jan 23, 2026
6a285c9
mem: add statistics for removed full prefetch sources
Llemonade Jan 30, 2026
49c20e3
mem: prioritize despacitostream requests
Llemonade Jan 30, 2026
9842228
mem:change default queue_filter value to true
Llemonade Feb 3, 2026
e4a76f0
mem: update queueFilter condition in prefetch redundancy check
Llemonade Feb 4, 2026
de19790
mem: add despacito Trace arch-db support
Llemonade Feb 4, 2026
c06cd8f
mem : align bop hash
Llemonade Feb 5, 2026
58bfb1d
mem: disable prefetch buffers mode in ideal_kmhv3 configuration
Llemonade Mar 16, 2026
545617c
mem: clean code and add comment
Llemonade Mar 17, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions configs/common/Options.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,11 @@ def addCommonOptions(parser, configure_xiangshan=False):
help="""
Prefetching cache level for SMS'pht""")

parser.add_argument("--enable-pf-buffer", action="store_true", default=False,
help="""
Force all hardware prefetchers to enable their
optional prefetch buffer (QueuedPrefetcher.use_pf_buffer).""")

parser.add_argument("--cpu-clock", action="store", type=str,
default='3GHz',
help="Clock for blocks running at CPU speed")
Expand Down
21 changes: 20 additions & 1 deletion configs/common/PrefetcherConfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ def create_prefetcher(cpu, cache_level, options):
prefetcher = _get_hwp(prefetcher_name)
print(f"create_prefetcher at {cache_level}: {prefetcher_name}")

if prefetcher != NULL and getattr(options, 'enable_pf_buffer', False):
if hasattr(prefetcher, 'use_pf_buffer'):
prefetcher.use_pf_buffer = True

if prefetcher == NULL:
return NULL

Expand Down Expand Up @@ -59,6 +63,10 @@ def create_prefetcher(cpu, cache_level, options):
prefetcher.enable_activepage = False
prefetcher.enable_pht = True
prefetcher.enable_xsstream = True
prefetcher.prefetch_train = False # disable L1PF train L2
# disable unnecessary filter to align with RTL when in pf_buffer mode
if hasattr(prefetcher, 'queue_filter'):
prefetcher.queue_filter = False

if cache_level == 'l2':
if options.classic_l2:
Expand All @@ -72,6 +80,10 @@ def create_prefetcher(cpu, cache_level, options):
prefetcher.enable_despacito_stream = False
prefetcher.bop_large = XSVirtualLargeBOP(is_sub_prefetcher=True,enable_adaptoffset=False)
prefetcher.bop_small = XSPhysicalSmallBOP(is_sub_prefetcher=True,enable_adaptoffset=False)
prefetcher.prefetch_train = False # disable L1PF train L2
# disable unnecessary filter to align with RTL when in pf_buffer mode
if hasattr(prefetcher, 'queue_filter'):
prefetcher.queue_filter = False
if options.l1_to_l2_pf_hint:
prefetcher.queue_size = 64
prefetcher.max_prefetch_requests_with_pending_translation = 128
Expand All @@ -88,10 +100,17 @@ def create_prefetcher(cpu, cache_level, options):
prefetcher.enable_bop = True
prefetcher.enable_cdp = False
prefetcher.enable_despacito_stream = False
if prefetcher.enable_despacito_stream:
# if you want to check despacito pattern trace, set this to True
prefetcher.despacito_stream.enable_despacito_db = False
prefetcher.bop_large = XSVirtualLargeBOP(is_sub_prefetcher=True,enable_adaptoffset=False)
prefetcher.bop_small = XSPhysicalSmallBOP(is_sub_prefetcher=True,enable_adaptoffset=False)
prefetcher.prefetch_train = False # disable L1PF train L2
# disable unnecessary filter to align with RTL when in pf_buffer mode
if hasattr(prefetcher, 'queue_filter'):
prefetcher.queue_filter = False
if options.l1_to_l2_pf_hint:
prefetcher.queue_size = 64
prefetcher.queue_size = 32
prefetcher.max_prefetch_requests_with_pending_translation = 128

if cache_level == 'l3':
Expand Down
24 changes: 24 additions & 0 deletions configs/common/xiangshan.py
Original file line number Diff line number Diff line change
Expand Up @@ -708,6 +708,7 @@ def _finish_xiangshan_system(args, test_sys, TestCPUClass, ruby):
test_sys.arch_db.dump_l3_evict_trace = False
test_sys.arch_db.dump_l1_miss_trace = False
test_sys.arch_db.dump_bop_train_trace = False
test_sys.arch_db.dump_stride_train_trace = False
test_sys.arch_db.dump_sms_train_trace = False
test_sys.arch_db.dump_vaddr_trace = False
test_sys.arch_db.dump_lifetime = False
Expand Down Expand Up @@ -780,6 +781,29 @@ def _finish_xiangshan_system(args, test_sys, TestCPUClass, ruby):
"Conf INT NOT NULL," \
"Miss BOOL NOT NULL," \
"SITE TEXT);"
,
"CREATE TABLE StrideTrainTrace(" \
"ID INTEGER PRIMARY KEY AUTOINCREMENT," \
"Tick INT NOT NULL," \
"Addr INT NOT NULL," \
"PC INT NOT NULL," \
"HashPC INT NOT NULL," \
"QueryHit BOOL NOT NULL," \
"IsFirstShot BOOL NOT NULL," \
"Miss BOOL NOT NULL," \
"IsTrain BOOL NOT NULL," \
"SITE TEXT);"
,
"CREATE TABLE DespacitoTrainTrace(" \
"ID INTEGER PRIMARY KEY AUTOINCREMENT," \
"Tick INT NOT NULL," \
"vAddr INT NOT NULL," \
"pAddr INT NOT NULL," \
"PC INT NOT NULL," \
"hasPC BOOL NOT NULL," \
"Miss BOOL NOT NULL," \
"IsTrain BOOL NOT NULL," \
"SITE TEXT);"
,# perfCounter CommitTrace
perfCCT_cmd
]
Expand Down
3 changes: 2 additions & 1 deletion configs/example/idealkmhv3.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,8 @@ def setKmhV3IdealParams(args, system):
# If user didn't specify bp_type, set default based on ideal_kmhv3
args.bp_type = 'DecoupledBPUWithBTB'
args.l2_size = '2MB'

# Keep prefetch buffers disabled for all hardware prefetchers in this config.
args.enable_pf_buffer = False
# Match the memories with the CPUs, based on the options for the test system
TestMemClass = Simulation.setMemClass(args)

Expand Down
3 changes: 2 additions & 1 deletion configs/example/kmhv2.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@
# disable l1 berti, l2 cdp
args.l2_wrapper_hwp_type = "L2CompositeWithWorkerPrefetcher"
args.kmh_align = True

# Enable prefetch buffers for all hardware prefetchers in this config.
args.enable_pf_buffer = True
assert not args.external_memory_system

test_mem_mode = 'timing'
Expand Down
3 changes: 3 additions & 0 deletions configs/example/kmhv3.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,9 @@ def setKmhV3Params(args, system):

assert not args.external_memory_system

# Enable prefetch buffers for all hardware prefetchers in this config.
args.enable_pf_buffer = True

# Set default bp_type based on ideal_kmhv3 flag
# If user didn't specify bp_type, set default based on ideal_kmhv3
args.bp_type = 'DecoupledBPUWithBTB'
Expand Down
82 changes: 82 additions & 0 deletions docs/prefetch_cache_partition_plan.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# 预取专用Cache分区计划

## 目标
为 Cache 增加一个只存放预取行的分区。预取的插入/替换/查询仅在该分区内进行,普通需求行不得占用。需求命中应能正常读取,是否迁移需定义。

## 现有关键位置
- 访问/命中/未命中与填充逻辑位于 [src/mem/cache/base.cc](src/mem/cache/base.cc)。
- Cache 前端类在 [src/mem/cache/cache.hh](src/mem/cache/cache.hh) 及实现 [src/mem/cache/cache.cc](src/mem/cache/cache.cc)。
- 标签与组相联实现位于 [src/mem/cache/tags/base.hh](src/mem/cache/tags/base.hh)、[src/mem/cache/tags/base_set_assoc.hh](src/mem/cache/tags/base_set_assoc.hh)、[src/mem/cache/tags/base_set_assoc.cc](src/mem/cache/tags/base_set_assoc.cc)。
- 预取元数据在填充时写入,逻辑在 base.cc 的填充路径。

## 待确认设计点
1) 分区形态:每组预留若干路 vs 独立标签存储,倾向每组预留路以便接入现有索引。
2) 需求命中是否迁移到主分区,或仅标记“已需求访问”并留在预取分区。
3) 替换策略:预取分区独立策略实例,或复用同一策略但作用于子集路。
4) 容量参数:按路数/字节/比例暴露,支持 0 关闭。
5) 预取分区驱逐与主分区在一致性/写回语义上是否完全相同。

## 工作步骤
1) **参数面**:新增 Cache 参数(开关与分区大小),配置层与 SCons 暴露。
2) **块元数据**:扩展 `CacheBlk` 标记分区驻留与是否被需求提升,覆盖序列化/重置。
3) **标签/组相联分区**:在 BaseSetAssoc(或派生)维护主/预取两套替换池;查找覆盖两分区;按分区选牺牲块。
4) **查找路径**:`BaseCache::access` 同查两分区;命中预取分区时的处理与统计拆分。
5) **填充/分配路由**:`handleFill`/`allocateBlock` 将预取响应路由至预取分区,禁止非预取分配进入;分区满时仅内部驱逐。
6) **提升策略**:需求命中预取分区时决定“迁移到主分区”或“原地标记需求”,并更新替换元数据。
7) **驱逐/写回路径**:`evictBlock`、`doWritebacks`、CleanEvict 保持分区隔离但语义一致,统计独立。
8) **预取队列**:确保 MSHR 分配对预取请求打标,便于填充分区路由识别;必要时增加断言/计数。
9) **统计/探针**:预取分区占用、命中、填充、驱逐、提升、DOA 等计数,必要时调整暖身处理。
10) **配置与文档**:更新 Cache.py/PrefetcherConfig.py 等配置,补充用户文档与调优说明。
11) **测试**:微基准覆盖仅预取、混合需求+预取、启用分区的 checkpoint/恢复、一致性与失效场景。

## 风险/关注点
- 替换策略需支持双池。
- 分区化可能影响延迟建模(tag/data 端口计数)。
- 序列化需保留分区与替换状态。
- 默认关闭以确保兼容性。

## 讨论更新(方案锁定后续步骤)
基于已确认的设计选项:
1) 分区形态:每组预留若干路。
2) 需求命中仅标记“已需求访问”并留在预取分区,不迁移。
3) 替换策略:复用同一替换策略实例,但作用于分区内子集路。
4) 容量参数:按比例暴露,支持 0 关闭。
5) 驱逐语义:预取分区与主分区在一致性/写回语义上完全相同。

### 下一步工作规划
1) **参数落地**:新增比例型分区参数与开关;Python 配置与 SCons 接口同步;默认 0 关闭。
2) **块元数据**:为 `CacheBlk` 增加分区驻留标记与“已需求访问”标志;补齐序列化/重置逻辑。
3) **分区化标签/替换**:在 BaseSetAssoc(或派生)实现按组预留路的双分区池,复用同一替换策略实例但限制候选路集;提供按分区选 victim 的接口。
4) **访问路径**:`BaseCache::access` 查两分区;命中预取分区时仅标记“已需求访问”,不迁移;统计分拆。
5) **填充/分配路由**:`handleFill`/`allocateBlock` 将预取响应放入预取分区;普通需求不得占用;分区满时仅内部驱逐,语义与主分区一致。
6) **驱逐与写回**:复用现有写回/一致性流程,但保持分区隔离与统计独立;确保 CleanEvict/Writeback 行为一致。
7) **预取标记链路**:确保 MSHR/请求链路对预取打标,便于填充分区路由;缺标时记录计数/告警。
8) **统计**:新增占用、命中、填充、驱逐、已需求访问计数(含 DOA),按分区拆分;若使用比例参数,记录实际预留路数。
9) **测试计划**:
- 仅预取流:验证预取分区被使用且需求不会进入。
- 混合流:需求命中预取分区仅标记不迁移;容量隔离有效。
- 恢复/一致性:checkpoint/restore,CleanEvict/Writeback 语义一致性。
- 关闭模式:参数为 0 时行为与原先一致。

## 讨论更新(追加)
新的设计点修正:
1) 分区形态:每组预留若干路,按比例参数计算预留路数;向下取整;若不足 1 路则该组预留 0(等价于禁用)。
2) 需求命中:仅标记“已需求访问”,不迁移出预取分区。
3) 替换策略:预取分区使用独立的替换策略实例(与主分区解耦)。
4) 容量参数:按比例暴露,支持 0 关闭;组数若非幂次则报错(仅在参数为幂次组数时工作)。
5) 驱逐语义:预取分区与主分区在一致性/写回上完全一致。

### 下一步工作规划(根据新设计点)
1) **参数实现**:比例型分区参数,向下截断到整路;不足 1 路取 0;若组数非幂次直接报错;默认 0 关闭。配置层与 SCons 同步。
2) **块元数据**:`CacheBlk` 增加分区驻留与“已需求访问”标志,含序列化/重置。
3) **分区化标签/替换**:BaseSetAssoc(或派生)维护主/预取双池,预取池使用独立替换策略实例;按组预留路数由比例计算;victim 选择限定在各自池内。
4) **访问路径**:`BaseCache::access` 查两分区;命中预取分区仅标记已需求访问,不迁移;统计分拆。
5) **填充/分配路由**:`handleFill`/`allocateBlock` 将预取响应放入预取分区,普通需求不得占用;预取分区满时仅内部驱逐,语义与主分区一致。
6) **驱逐与写回**:复用现有流程,保持分区隔离与统计独立,保证 CleanEvict/Writeback 语义一致。
7) **预取标记链路**:MSHR/请求需带预取标记,缺标计数/告警,确保填充分区路由正确。
8) **统计**:分区占用、命中、填充、驱逐、已需求访问、DOA;记录比例参数实际转换的预留路数。
9) **测试**:
- 仅预取流:验证预取分区启用与容量隔离。
- 混合流:需求命中不迁移,统计正确。
- 非幂次组数:应触发报错覆盖。
- 关闭模式:参数为 0 时回归旧行为。
2 changes: 1 addition & 1 deletion src/cpu/o3/dyn_inst.cc
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ namespace o3
DynInst::DynInst(const Arrays &arrays, const StaticInstPtr &static_inst,
const StaticInstPtr &_macroop, InstSeqNum seq_num, CPU *_cpu)
: seqNum(seq_num), staticInst(static_inst),
xsMeta(new XsDynInstMeta()),
xsMeta(new XsDynInstMeta(seq_num)),
cpu(_cpu),
_numSrcs(arrays.numSrcs), _numDests(arrays.numDests),
_flatDestIdx(arrays.flatDestIdx), _destIdx(arrays.destIdx),
Expand Down
12 changes: 7 additions & 5 deletions src/cpu/o3/dyn_inst_xsmeta.hh
Original file line number Diff line number Diff line change
Expand Up @@ -59,12 +59,14 @@ namespace o3

class XsDynInstMeta : public RefCounted
{
public:
bool squashed;
Addr instAddr;
public:
bool squashed;
Addr instAddr;
InstSeqNum seqNum;

public:
XsDynInstMeta(): squashed(false),instAddr(0) {}
public:
XsDynInstMeta(): squashed(false), instAddr(0), seqNum(0) {}
XsDynInstMeta(InstSeqNum seq): squashed(false), instAddr(0), seqNum(seq) {}
};

using XsDynInstMetaPtr = RefCountingPtr<XsDynInstMeta>;
Expand Down
13 changes: 13 additions & 0 deletions src/mem/cache/base.cc
Original file line number Diff line number Diff line change
Expand Up @@ -514,6 +514,13 @@ BaseCache::handleTimingReqMiss(PacketPtr pkt, MSHR *mshr, CacheBlk *blk,
pkt->pfSource = mshr->getPFSource();
pkt->pfDepth = mshr->getPFDepth();

// Demand request merging into prefetch-only MSHR
if (pkt->isDemand()) {
stats.demandMergedIntoPfMSHR++;
DPRINTF(Cache, "Demand request %#lx merged into prefetch MSHR\n",
pkt->getAddr());
}

} else if (mshr->hasFromCPU()) {
// no pkt in mshr originated from cache; all of them are from cpu
pkt->coalescingMSHR = true;
Expand Down Expand Up @@ -2887,6 +2894,12 @@ BaseCache::CacheStats::CacheStats(BaseCache &c)
"number of squashed dead block replacements"),
ADD_STAT(squashedLiveBlockReplacements, statistics::units::Count::get(),
"number of squashed live block replacements"),
ADD_STAT(pfMergedWithDemand, statistics::units::Count::get(),
"number of MSHR completions where prefetch was merged with demand"),
ADD_STAT(pfOnlyFill, statistics::units::Count::get(),
"number of MSHR completions with only prefetch (no demand merge)"),
ADD_STAT(demandMergedIntoPfMSHR, statistics::units::Count::get(),
"number of demand requests that merged into prefetch MSHR"),
ADD_STAT(squashedDemandHits, statistics::units::Count::get(),
"number of squashed inst block demand hits"),
ADD_STAT(loadTagReadFails, statistics::units::Count::get(),
Expand Down
7 changes: 7 additions & 0 deletions src/mem/cache/base.hh
Original file line number Diff line number Diff line change
Expand Up @@ -1313,6 +1313,13 @@ class BaseCache : public ClockedObject, public CacheAccessor
/** Number of replacements of blocks from squashed inst but reused. */
statistics::Scalar squashedLiveBlockReplacements;

/** Number of MSHR completions where prefetch was merged with demand */
statistics::Scalar pfMergedWithDemand;
/** Number of MSHR completions with only prefetch (no demand merge) */
statistics::Scalar pfOnlyFill;
/** Number of demand requests that merged into prefetch MSHR */
statistics::Scalar demandMergedIntoPfMSHR;

/** Number of demand hits that accessed squashed inst blocks. */
statistics::Scalar squashedDemandHits;

Expand Down
6 changes: 6 additions & 0 deletions src/mem/cache/cache.cc
Original file line number Diff line number Diff line change
Expand Up @@ -988,6 +988,12 @@ Cache::serviceMSHRTargets(MSHR *mshr, const PacketPtr pkt, CacheBlk *blk)
blk->setPrefetched();
blk->setXsMetadata(pkt->req->getXsMetadata());
DPRINTF(Cache, "Marking block as prefetched from prefetcher %i\n", blk->getXsMetadata().prefetchSource);
stats.pfOnlyFill++; // Pure prefetch fill (no demand merge)
} else if (blk && from_core && from_pref) {
// Prefetch was merged with demand - won't be marked as prefetched
stats.pfMergedWithDemand++;
DPRINTF(Cache, "Prefetch merged with demand for %#lx - not marking as prefetched\n",
blk->getTag());
}

if (!mshr->hasLockedRMWReadTarget()) {
Expand Down
Loading
Loading