Skip to content

Commit c51e30b

Browse files
committed
Fix GPU deadlock caused by m_next_global bottleneck in adaptive cache bypass
Root cause:
- The original implementation checked prediction_table at the response stage.
- When several responses for high-miss-rate requests returned together, only one could enter m_next_global (a single slot); the others got stuck in response_fifo.
- This caused response_fifo to fill up, which blocked the interconnect and led to deadlock.

Solution:
- Move the prediction-based bypass decision from the response stage to the request stage.
- Check prediction_table in memory_cycle() before sending the request to L1D.
- Non-store requests whose prediction_table entry (indexed by the PC truncated to 8 bits) is >= 8 now bypass L1D entirely.
- Response handling uses the normal global memory path, which has adequate buffering.

Benefits:
- Eliminates the m_next_global bottleneck.
- Avoids unnecessary L1D accesses for predicted-miss requests.
- Aligns with the design intent of the adaptive bypass paper.
- Keeps the prediction_table update logic unchanged.
1 parent fdf6a54 commit c51e30b

File tree

1 file changed

+18
-7
lines changed

1 file changed

+18
-7
lines changed

src/gpgpu-sim/shader.cc

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2277,6 +2277,18 @@ bool ldst_unit::memory_cycle(warp_inst_t &inst,
22772277
// skip L1 cache if the option is enabled
22782278
if (m_core->get_config()->gmem_skip_L1D && (CACHE_L1 != inst.cache_op))
22792279
bypassL1D = true;
2280+
2281+
// cwpeng: adaptive bypass based on prediction table
2282+
// Check prediction table BEFORE sending request, not after response returns
2283+
if (!inst.is_store() && m_L1D != NULL) { // Only for global read
2284+
uint8_t hashed_pc = (uint8_t)inst.pc;
2285+
if (m_L1D->prediction_table[hashed_pc] >= 8) {
2286+
bypassL1D = true;
2287+
// Uncomment for debugging:
2288+
// printf("Bypass L1D on request (predict high miss rate) pc:%u, pred:%u\n",
2289+
// hashed_pc, m_L1D->prediction_table[hashed_pc]);
2290+
}
2291+
}
22802292
}
22812293
if (bypassL1D) {
22822294
// bypass L1 cache
@@ -2869,20 +2881,19 @@ void ldst_unit::cycle() {
28692881
// on load miss only
28702882

28712883
bool bypassL1D = false;
2872-
uint8_t temp_pc = 0; //cwpeng
2873-
address_type currPC = mf->get_pc();
2874-
temp_pc = (currPC == -1) ? (uint8_t) mf->get_original_mf()->get_pc() : (uint8_t) currPC;
2884+
// cwpeng: Removed prediction-based bypass from response stage
2885+
// Bypass decision is now made at request stage (in memory_cycle)
2886+
// to avoid m_next_global bottleneck and deadlock issues
28752887

28762888
if (CACHE_GLOBAL == mf->get_inst().cache_op || (m_L1D == NULL)) {
28772889
bypassL1D = true;
28782890
} else if (mf->get_access_type() == GLOBAL_ACC_R ||
28792891
mf->get_access_type() ==
28802892
GLOBAL_ACC_W) { // global memory access
28812893
if (m_core->get_config()->gmem_skip_L1D) bypassL1D = true;
2882-
if (m_L1D->prediction_table[temp_pc] >= 8 && mf->get_access_type() == GLOBAL_ACC_R){
2883-
bypassL1D = true;
2884-
printf("Bypass L1D due to high miss rate prediction pc:%u, pred:%u\n", temp_pc, m_L1D->prediction_table[temp_pc]);
2885-
}
2894+
// REMOVED: prediction-based bypass logic from here
2895+
// The bypass decision based on prediction_table is now made earlier
2896+
// in memory_cycle() before the request is sent to L1D/L2
28862897
}
28872898
if (bypassL1D) {
28882899
if (m_next_global == NULL) {

0 commit comments

Comments
 (0)