Commit 4a88713

emlinmeta-codesync[bot] authored and committed
Change from first element to a random element for cache-missing items (#4955)
Summary:
Pull Request resolved: #4955

X-link: https://github.com/facebookresearch/FBGEMM/pull/1974

In the inference ZCH backend, we cannot use the initializer to produce randomized init values for cache-missing items, because the initializer does not work in parallel read/write mode. The current behavior is to always take the first item in the hash map, which gives little randomization. This diff adds randomization for cache-missing IDs, and also adds a log of the cache-miss load in every batch.

Update:
- Changed back to using the hash map size, instead of the allocated block count, as the randomization base.
- Check whether the block is actually used. The reason is that a block can be allocated but never used; if we picked such a block, it would return all zeros.

Verified the new change in VG, with no performance regression.

Reviewed By: EddyLXJ, steven1327

Differential Revision: D83612329

fbshipit-source-id: 23e7f0d1e249c9a3117800c6f992104900fca748
1 parent 33e61aa commit 4a88713
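
For context, the sampling strategy described in the summary can be sketched in isolation: draw a random starting cursor bounded by the hash-map size, probe a bounded number of consecutive blocks for one that is actually in use, and fall back to the first map element if every probe misses. This is a minimal illustrative sketch only: ToyPool and pick_random_used_block are hypothetical stand-ins for FBGEMM's FixedBlockPool and the logic in dram_kv_embedding_cache, and std::mt19937 stands in for folly::Random.

    #include <cstddef>
    #include <random>
    #include <vector>

    // Hypothetical stand-in for the shard's block pool: slots can be
    // allocated but never used, in which case get_block() returns nullptr.
    struct ToyPool {
      std::vector<float*> slots;
      float* get_block(std::size_t cursor) const {
        return slots.empty() ? nullptr : slots[cursor % slots.size()];
      }
    };

    // Pick a random starting cursor bounded by the hash-map size (assumes
    // map_size > 0; the caller only reaches here when the map is non-empty),
    // probe up to 16 consecutive slots for a block that is actually in use,
    // and fall back to a known-good pointer (e.g. the first map element).
    float* pick_random_used_block(const ToyPool& pool, std::size_t map_size,
                                  float* fallback) {
      static thread_local std::mt19937 gen{std::random_device{}()};
      std::uniform_int_distribution<std::size_t> dist(0, map_size - 1);
      const std::size_t random_start = dist(gen);
      for (int attempts = 0; attempts < 16; ++attempts) {
        if (float* block = pool.get_block(random_start + attempts)) {
          return block;  // used block found: initialize the miss from it
        }
      }
      return fallback;  // every probe hit an allocated-but-unused slot
    }

Probing consecutive cursors keeps the cost bounded (at most 16 pool reads), while the random start spreads the sampled rows across the shard.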

File tree

2 files changed: +98 -4 lines changed

fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_inference_embedding.h

Lines changed: 34 additions & 4 deletions
@@ -22,6 +22,7 @@
 #include <thrift/lib/cpp2/protocol/CompactProtocol.h>
 #include <thrift/lib/cpp2/protocol/Serializer.h>
 #include <torch/script.h>
+#include <random>
 #include "common/time/Time.h"

 #include "../ssd_split_embeddings_cache/initializer.h"
@@ -419,9 +420,36 @@ class DramKVInferenceEmbedding {
                   before_read_lock_ts;

           if (!wlmap->empty()) {
-            row_storage_data_ptr =
-                FixedBlockPool::data_ptr<weight_type>(
-                    wlmap->begin()->second);
+            // Simple block-based randomization using get_block with
+            // cursor
+            auto* pool = kv_store_.pool_by(shard_id);
+
+            // Random starting cursor based on map size for good
+            // entropy
+            size_t random_start =
+                folly::Random::rand32(wlmap->size());
+
+            // Try to find a used block starting from random
+            // position
+            weight_type* block = nullptr;
+            for (int attempts = 0; attempts < 16; ++attempts) {
+              block = pool->template get_block<weight_type>(
+                  random_start + attempts);
+              if (block != nullptr) {
+                // Block is used (not null)
+                row_storage_data_ptr =
+                    FixedBlockPool::data_ptr<weight_type>(block);
+                break;
+              }
+            }
+
+            // Fallback: if no used block found, use first element
+            // from map
+            if (block == nullptr) {
+              row_storage_data_ptr =
+                  FixedBlockPool::data_ptr<weight_type>(
+                      wlmap->begin()->second);
+            }
           } else {
             const auto& init_storage =
                 initializers_[shard_id]->row_storage_;
@@ -526,7 +554,9 @@ class DramKVInferenceEmbedding {
               read_lookup_cache_total_duration / num_shards_;
           read_acquire_lock_avg_duration_ +=
               read_acquire_lock_total_duration / num_shards_;
-          read_missing_load_avg_ += read_missing_load / num_shards_;
+          LOG_EVERY_MS(INFO, 5000)
+              << "get_kv_db_async total read_missing_load per batch: "
+              << read_missing_load;
           return std::vector<folly::Unit>(results.size());
        });
  };
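
A note on the logging change above: LOG_EVERY_MS(INFO, 5000) emits this message at most once every five seconds, so the per-batch cache-miss load shows up in the logs without flooding them; this replaces accumulating the value into read_missing_load_avg_.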

fbgemm_gpu/test/tbe/dram_kv/dram_kv_inference_test.py

Lines changed: 64 additions & 0 deletions
@@ -249,3 +249,67 @@ def reader_thread() -> None:  # pyre-ignore
         self.assertTrue(equal_one_of(embs[5, :4], possible_embs))
         reader_thread.join()
         self.assertFalse(reader_failed_event.is_set())
+
+    def test_randomized_cache_miss_initialization(self) -> None:
+        """Test that cache misses use randomized data from existing blocks."""
+        num_shards = 8
+        uniform_init_lower: float = -0.01
+        uniform_init_upper: float = 0.01
+
+        # Create DRAM KV inference cache
+        kv_embedding_cache = torch.classes.fbgemm.DramKVEmbeddingInferenceWrapper(
+            num_shards, uniform_init_lower, uniform_init_upper
+        )
+        kv_embedding_cache.init(
+            [(32, 4, SparseType.FP16.as_int())],
+            32,
+            4,
+            torch.tensor([0, 100], dtype=torch.int64),
+        )
+
+        # Setup: Populate the cache with many initial values for better
+        # randomization diversity. Use 400 setup items so each of the
+        # 8 shards gets ~50 entries.
+        setup_indices = torch.arange(0, 400, dtype=torch.int64)  # 400 setup items
+        setup_weights = torch.randint(
+            1, 255, (400, 32), dtype=torch.uint8
+        )  # Non-zero values to ensure a randomization source
+        print(f"setup_weights: {setup_weights}")
+
+        # Populate cache
+        kv_embedding_cache.set_embeddings(setup_indices, setup_weights)
+
+        # Execute: Request cache misses - these should get randomized
+        # initialization. Use indices outside the range [0, 399] to
+        # ensure they are actual cache misses.
+        miss_indices = torch.tensor([500, 501, 502, 503, 504], dtype=torch.int64)
+
+        # Get the cache miss results multiple times to check for randomization
+        results = []
+        for _ in range(5):
+            current_output = kv_embedding_cache.get_embeddings(miss_indices)
+            results.append(current_output.clone())
+
+        # Assert: Verify that randomization occurs. The results should not
+        # all be identical if randomization is working.
+        all_identical = True
+        for i in range(1, len(results)):
+            if not torch.equal(
+                results[0][:, :4], results[i][:, :4]
+            ):  # Only check first 4 columns (actual data)
+                all_identical = False
+                break
+
+        # Since we're using randomization, results should be different.
+        # Note: There's a small chance they could be identical by random
+        # chance, but with 5 trials of 5 vectors of 4 bytes, this is
+        # extremely unlikely.
+        self.assertFalse(
+            all_identical,
+            "Randomized cache miss initialization should produce different results",
+        )
+
+        # All results should be non-zero (since we populated the cache with
+        # non-zero random values)
+        for result in results:
+            # Check that at least some values are non-zero (indicating data
+            # came from existing blocks)
+            self.assertTrue(
+                torch.any(result[:, :4] != 0),
+                "Cache miss results should contain non-zero values when cache has data",
+            )
