Fix GPU deadlock caused by m_next_global bottleneck in adaptive cache bypass

claude · claude · commit c51e30b55dc8 · 2025-11-30T08:51:35.000Z
Root cause:
- The original implementation checked prediction_table at the response stage
- When multiple requests with a high predicted miss rate returned, only one could
  enter m_next_global (a single slot); the others got stuck in response_fifo
- This caused response_fifo to fill up, blocking the interconnect and causing deadlock

Solution:
- Move prediction-based bypass decision from response stage to request stage
- Check prediction_table in memory_cycle() before sending request to L1D
- Non-store requests with prediction_table[(uint8_t)pc] >= 8 now bypass L1D entirely
- Response handling uses normal global memory path with adequate buffering

Benefits:
- Eliminates m_next_global bottleneck
- Avoids unnecessary L1D accesses for predicted-miss requests
- Aligns with adaptive bypass paper's design intent
- Maintains prediction_table update logic
diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc
@@ -2277,6 +2277,18 @@ bool ldst_unit::memory_cycle(warp_inst_t &inst,
     // skip L1 cache if the option is enabled
     if (m_core->get_config()->gmem_skip_L1D && (CACHE_L1 != inst.cache_op))
       bypassL1D = true;
+
+    // cwpeng: adaptive bypass based on prediction table
+    // Check prediction table BEFORE sending request, not after response returns
+    if (!inst.is_store() && m_L1D != NULL) {  // Only for global read
+      uint8_t hashed_pc = (uint8_t)inst.pc;
+      if (m_L1D->prediction_table[hashed_pc] >= 8) {
+        bypassL1D = true;
+        // Uncomment for debugging:
+        // printf("Bypass L1D on request (predict high miss rate) pc:%u, pred:%u\n",
+        //        hashed_pc, m_L1D->prediction_table[hashed_pc]);
+      }
+    }
   }
   if (bypassL1D) {
     // bypass L1 cache
@@ -2869,20 +2881,19 @@ void ldst_unit::cycle() {
                                       // on load miss only
 
         bool bypassL1D = false;
-        uint8_t temp_pc = 0; //cwpeng
-        address_type currPC = mf->get_pc();
-        temp_pc = (currPC == -1) ? (uint8_t) mf->get_original_mf()->get_pc() : (uint8_t) currPC;
+        // cwpeng: Removed prediction-based bypass from response stage
+        // Bypass decision is now made at request stage (in memory_cycle)
+        // to avoid m_next_global bottleneck and deadlock issues
 
         if (CACHE_GLOBAL == mf->get_inst().cache_op || (m_L1D == NULL)) {
           bypassL1D = true;
         } else if (mf->get_access_type() == GLOBAL_ACC_R ||
                    mf->get_access_type() ==
                        GLOBAL_ACC_W) {  // global memory access
           if (m_core->get_config()->gmem_skip_L1D) bypassL1D = true;
-          if (m_L1D->prediction_table[temp_pc] >= 8 && mf->get_access_type() == GLOBAL_ACC_R){
-            bypassL1D = true;
-            printf("Bypass L1D due to high miss rate prediction pc:%u, pred:%u\n", temp_pc, m_L1D->prediction_table[temp_pc]);
-          }
+          // REMOVED: prediction-based bypass logic from here
+          // The bypass decision based on prediction_table is now made earlier
+          // in memory_cycle() before the request is sent to L1D/L2
         }
         if (bypassL1D) {
           if (m_next_global == NULL) {