Remove the `caching` flag from DynamicEmbTableOptions; select cache/storage mode from the HBM budget

shijieliu · shijieliu · commit 37fee9777d07 · 2026-02-10T23:48:09.000-08:00
diff --git a/corelib/dynamicemb/benchmark/benchmark_batched_dynamicemb_tables.py b/corelib/dynamicemb/benchmark/benchmark_batched_dynamicemb_tables.py
@@ -322,7 +322,6 @@ def create_dynamic_embedding_tables(args, device):
                 score_strategy=DynamicEmbScoreStrategy.LFU
                 if args.cache_algorithm == "lfu"
                 else DynamicEmbScoreStrategy.TIMESTAMP,
-                caching=args.caching,
             )
         )
 
diff --git a/corelib/dynamicemb/dynamicemb/batched_dynamicemb_tables.py b/corelib/dynamicemb/dynamicemb/batched_dynamicemb_tables.py
@@ -524,48 +524,11 @@ def __init__(
                 dtype=self.embedding_dtype,
             )
         )
-        
-    def _enable_cache_decision(
-        self, caching, hbm_budget, value_size, capacity, bucket_capacity
-    ) -> Tuple[bool, int, bool]:
-        """
-        The principles are as follows:
-            1. Cannot exceed the budget
-            2. If the budget is too small, HBM is not necessary
-        """
-
-        # if not caching, it is the flag to represent the single table on device or host.
-        on_device = True
-
-        if not caching:
-            if hbm_budget == 0:
-                on_device = False
-            elif hbm_budget > 0 and hbm_budget < value_size * capacity:
-                # Caching is not enabled but HBM is not enough to hold the entire table, so adjust to caching mode
-                # to utilize the reserved HBM
-                caching = True
-
-        if caching:
-            # If the budget is too small, HBM is not necessary
-            if hbm_budget < bucket_capacity * value_size:
-                warnings.warn(
-                    "The HBM budget is too small to serve as a cache, fallback to host table.",
-                    UserWarning,
-                )
-                caching = False
-                on_device = False
-
-        if caching:
-            cache_capacity = hbm_budget // value_size
-        else:
-            cache_capacity = -1
-
-        return caching, cache_capacity, on_device
 
     def _create_cache_storage(self) -> None:
         self._storages: List[Storage] = []
         self._caches: List[Cache] = []
-        self._caching = self._dynamicemb_options[0].caching
+        self._caching = False
 
         for option in self._dynamicemb_options:
             if option.training and option.optimizer_type == OptimizerType.Null:
@@ -576,28 +539,39 @@ def _create_cache_storage(self) -> None:
                     "Set OptimizerType to Null as not on training mode.", UserWarning
                 )
 
-            bucket_capacity_for_cache = 1024
-
-            caching, cache_capacity, on_device = self._enable_cache_decision(
-                option.caching,
-                option.local_hbm_for_values,
-                get_value_size(
-                    option.embedding_dtype,
-                    option.dim,
-                    option.optimizer_type,
-                ),
-                option.max_capacity,
-                bucket_capacity_for_cache,
+            value_size = get_value_size(
+                option.embedding_dtype,
+                option.dim,
+                option.optimizer_type,
             )
+            total_table_bytes = value_size * option.max_capacity
+            hbm_budget = option.local_hbm_for_values
+
+            if hbm_budget == 0:
+                # No HBM budget -> storage only, on host
+                option.local_hbm_for_values = 0
+                self._caches.append(None)
+                self._storages.append(DynamicEmbeddingTable(option, self._optimizer))
+            elif total_table_bytes <= hbm_budget:
+                # Entire table fits in HBM -> single table on GPU serves as
+                # both cache and storage (no eviction needed).
+                self._caching = True
+                table = DynamicEmbeddingTable(option, self._optimizer)
+                self._caches.append(table)
+                self._storages.append(table)
+            else:
+                # Partial HBM -> cache (GPU) + storage (host or external)
+                self._caching = True
+                bucket_capacity_for_cache = 1024
+                cache_capacity = hbm_budget // value_size
 
-            if caching:
                 cache_option = deepcopy(option)
                 cache_option.bucket_capacity = bucket_capacity_for_cache
-
                 cache_option.max_capacity = cache_capacity
                 cache_option.init_capacity = cache_capacity
-
-                self._caches.append(DynamicEmbeddingTable(cache_option, self._optimizer))
+                self._caches.append(
+                    DynamicEmbeddingTable(cache_option, self._optimizer)
+                )
 
                 storage_option = deepcopy(option)
                 storage_option.local_hbm_for_values = 0
@@ -607,11 +581,6 @@ def _create_cache_storage(self) -> None:
                     if PS
                     else DynamicEmbeddingTable(storage_option, self._optimizer)
                 )
-            else:
-                self._caches.append(None)
-                if not on_device:
-                    option.local_hbm_for_values = 0
-                self._storages.append(DynamicEmbeddingTable(option, self._optimizer))
 
         _print_memory_consume(
             self._table_names, self._dynamicemb_options, self._optimizer, self.device_id
diff --git a/corelib/dynamicemb/dynamicemb/dynamicemb_config.py b/corelib/dynamicemb/dynamicemb/dynamicemb_config.py
@@ -222,22 +222,6 @@ class DynamicEmbTableOptions(_ContextOptions):
         For `UNIFORM` and `TRUNCATED_NORMAL`, the `lower` and `upper` will set to $\pm {1 \over \sqrt{EmbeddingConfig.num\_embeddings}}$.
     eval_initializer_args: DynamicEmbInitializerArgs
         The initializer args for evaluation mode, and will return torch.zeros(...) as embedding by default if index/sparse feature is missing.
-    caching: bool
-        Flag to indicate dynamic embedding tables is working on caching mode, default to `False`.
-        When the device memory on a single GPU is insufficient to accommodate a single shard of the dynamic embedding table,
-            dynamicemb supports the mixed use of device memory and host memory(pinned memory).
-        But by default, the values of the entire table are concatenated with device memory and host memory.
-        This means that the storage location of one embedding is determined by `hash_function(key)`, and mapping to device memory will bring better lookup performance.
-        However, sparse features in training are often with temporal locality.
-        In order to store hot keys in device memory, dynamicemb creates two table instances,
-            whose values are stored in device memory and host memory respectively, and store hot keys on the GPU table priorily.
-        If the GPU table is full, the evicted keys will be inserted into the host table.
-        If the host table is also full, the key will be evicted(all the eviction is based on the score per key).
-        The original intention of eviction is based on this insight: features that only appear once should not occupy memory(even host memory) for a long time.
-        In short:
-            set **`caching=True`** will create a GPU table and a host table, and make GPU table serves as a cache;
-            set **`caching=False`** will create a hybrid table which use GPU and host memory in a concatenated way to store value.
-            All keys and other meta data are always stored on GPU for both cases.
     init_capacity : Optional[int], optional
         The initial capacity of the table. If not set, it defaults to max_capacity after sharding.
         If `init_capacity` is provided, it will serve as the initial table capacity on a single GPU.
@@ -265,9 +249,9 @@ class DynamicEmbTableOptions(_ContextOptions):
         Please refer to the API documentation for DynamicEmbCheckMode for more information.
     global_hbm_for_values : int
         Total GPU memory allocated to store embedding + optimizer states, in bytes. Default is 0.
-        It has different meanings under `caching=True` and  `caching=False`.
-            When `caching=False`, it decides how much GPU memory is in the total memory to store value in a single hybrid table.
-            When `caching=True`, it decides the table capacity of the GPU table.
+        If the budget can hold the entire table (max_capacity * value_size), the table lives entirely on GPU.
+        If the budget is nonzero but smaller, it determines the GPU cache capacity while the full table is stored on host/external storage.
+        If zero, the table is stored entirely on host memory.
     external_storage: Storage
         The external storage/ParamterServer which inherits the interface of Storage, and can be configured per table.
         If not provided, will using DynamicEmbeddingTable as the Storage.
@@ -297,7 +281,6 @@ class DynamicEmbTableOptions(_ContextOptions):
             value=0.0,
         )
     )
-    caching: bool = False
     init_capacity: Optional[
         int
     ] = None  # if not set then set to max_capcacity after sharded
@@ -339,7 +322,6 @@ def __ne__(self, other):
     def get_grouped_key(self):
         grouped_key = {}
         grouped_key["training"] = self.training
-        grouped_key["caching"] = self.caching
         grouped_key["external_storage"] = self.external_storage
         grouped_key["index_type"] = self.index_type
         grouped_key["score_strategy"] = self.score_strategy
diff --git a/corelib/dynamicemb/example/example.py b/corelib/dynamicemb/example/example.py
@@ -521,7 +521,6 @@ def get_planner(
                     mode=DynamicEmbInitializerMode.NORMAL
                 ),
                 score_strategy=DynamicEmbScoreStrategy.STEP,
-                caching=caching,
                 training=training,
                 admit_strategy=admit_strategy,
                 admission_counter=admission_counter,
@@ -596,7 +595,7 @@ def apply_dmp(model, args, training):
         args.batch_size,
         optimizer_type=optimizer_type,
         training=training,
-        caching=args.caching,
+        caching=args.caching,  # used for HBM budget calculation, not passed to options
         args=args,
     )
     # get plan for all ranks.
diff --git a/corelib/dynamicemb/test/test_batched_dynamic_embedding_tables_v2.py b/corelib/dynamicemb/test/test_batched_dynamic_embedding_tables_v2.py
@@ -482,7 +482,6 @@ def test_forward_train_eval(
             embedding_dtype=value_type,
             device_id=device_id,
             score_strategy=DynamicEmbScoreStrategy.TIMESTAMP,
-            caching=caching,
             local_hbm_for_values=1024**3,
             external_storage=PS,
         )
@@ -648,7 +647,6 @@ def test_backward(opt_type, opt_params, caching, pooling_mode, dims, determinist
             embedding_dtype=value_type,
             device_id=device_id,
             score_strategy=DynamicEmbScoreStrategy.TIMESTAMP,
-            caching=caching,
             local_hbm_for_values=1024**3,
             external_storage=PS,
         )
@@ -791,7 +789,6 @@ def test_prefetch_flush_in_cache(opt_type, opt_params, deterministic, PS):
             embedding_dtype=value_type,
             device_id=device_id,
             score_strategy=DynamicEmbScoreStrategy.STEP,
-            caching=True,
             local_hbm_for_values=1024**3,
             external_storage=PS,
         )
@@ -976,7 +973,6 @@ def test_deterministic_insert(opt_type, opt_params, caching, PS, iteration, batc
             embedding_dtype=value_type,
             device_id=device_id,
             score_strategy=DynamicEmbScoreStrategy.TIMESTAMP,
-            caching=caching,
             local_hbm_for_values=init_capacity * dim * 4,
             external_storage=PS,
         )
@@ -1093,7 +1089,6 @@ def test_empty_batch(opt_type, opt_params, dim, caching, deterministic, PS):
             embedding_dtype=value_type,
             device_id=device_id,
             score_strategy=DynamicEmbScoreStrategy.TIMESTAMP,
-            caching=caching,
             local_hbm_for_values=1024**3,
             external_storage=PS,
         )
diff --git a/corelib/dynamicemb/test/unit_tests/incremental_dump/test_batched_dynamicemb_tables.py b/corelib/dynamicemb/test/unit_tests/incremental_dump/test_batched_dynamicemb_tables.py
@@ -133,7 +133,6 @@ def test_without_eviction(
             local_hbm_for_values=1024**3,
             score_strategy=score_strategy,
             num_of_buckets_per_alloc=num_embeddings[i] // bucket_capacity,
-            caching=caching,
         )
         for i in range(table_num)
     ]
diff --git a/corelib/dynamicemb/test/unit_tests/test_embedding_admission.py b/corelib/dynamicemb/test/unit_tests/test_embedding_admission.py
@@ -311,8 +311,7 @@ def test_admission_strategy_validation(
             score_strategy
         ),  # Use timestamp for admission
         use_index_dedup=use_index_dedup,
-        caching=caching,
-        cache_capacity_ratio=cache_capacity_ratio if caching else 0.1,
+        cache_capacity_ratio=cache_capacity_ratio if caching else 1.0,
         admit_strategy=admission_strategy,  # Pass admission strategy
     )
 
diff --git a/corelib/dynamicemb/test/unit_tests/test_embedding_dump_load.py b/corelib/dynamicemb/test/unit_tests/test_embedding_dump_load.py
@@ -199,7 +199,6 @@ def apply_dmp(
     device: torch.device,
     score_strategy: DynamicEmbScoreStrategy = DynamicEmbScoreStrategy.LFU,
     use_index_dedup: bool = False,
-    caching: bool = False,
     cache_capacity_ratio: float = 0.5,
     admit_strategy: AdmissionStrategy = None,
 ):
@@ -213,11 +212,7 @@ def apply_dmp(
                 tmp_type = eb_config.data_type
 
                 embedding_type_bytes = DATA_TYPE_NUM_BITS[tmp_type] / 8
-                emb_num_embeddings = (
-                    eb_config.num_embeddings * cache_capacity_ratio
-                    if caching
-                    else eb_config.num_embeddings
-                )
+                emb_num_embeddings = eb_config.num_embeddings
                 emb_num_embeddings_next_power_of_2 = 2 ** math.ceil(
                     math.log2(emb_num_embeddings)
                 )  # HKV need embedding vector num is power of 2
@@ -249,12 +244,20 @@ def apply_dmp(
                     else 0
                 )
 
-                # Include optimizer state in HBM calculation
-                total_hbm_need = (
+                # Include optimizer state in HBM calculation.
+                # When cache_capacity_ratio < 1, scale down so that only a
+                # fraction of the table fits in HBM (triggers cache+storage).
+                # When cache_capacity_ratio >= 1, use full size (all-HBM mode).
+                full_table_hbm = (
                     embedding_type_bytes
                     * (dim + optimizer_state_dim)
                     * emb_num_embeddings_next_power_of_2
                 )
+                total_hbm_need = int(
+                    full_table_hbm * cache_capacity_ratio
+                    if cache_capacity_ratio < 1.0
+                    else full_table_hbm
+                )
 
                 admission_counter = KVCounter(
                     max(1024 * 1024, emb_num_embeddings_next_power_of_2 // 4)
@@ -268,7 +271,6 @@ def apply_dmp(
                     ),
                     bucket_capacity=emb_num_embeddings_next_power_of_2,
                     max_capacity=emb_num_embeddings_next_power_of_2,
-                    caching=caching,
                     local_hbm_for_values=1024**3,
                     admit_strategy=admit_strategy,
                     admission_counter=admission_counter,
@@ -308,7 +310,6 @@ def create_model(
     optimizer_kwargs: Dict[str, Any],
     score_strategy: DynamicEmbScoreStrategy = DynamicEmbScoreStrategy.LFU,
     use_index_dedup: bool = False,
-    caching: bool = False,
     cache_capacity_ratio: float = 0.5,
     admit_strategy: AdmissionStrategy = None,
 ):
@@ -344,7 +345,6 @@ def create_model(
         torch.device(f"cuda:{torch.cuda.current_device()}"),
         score_strategy=score_strategy,
         use_index_dedup=use_index_dedup,
-        caching=caching,
         cache_capacity_ratio=cache_capacity_ratio,
         admit_strategy=admit_strategy,
     )
diff --git a/corelib/dynamicemb/test/unit_tests/test_lfu_scores.py b/corelib/dynamicemb/test/unit_tests/test_lfu_scores.py
@@ -270,8 +270,7 @@ def test_lfu_score_validation(
         optimizer_kwargs=optimizer_kwargs,
         score_strategy=DynamicEmbScoreStrategy.LFU,
         use_index_dedup=use_index_dedup,
-        caching=caching,
-        cache_capacity_ratio=cache_capacity_ratio if caching else 0.1,
+        cache_capacity_ratio=cache_capacity_ratio if caching else 1.0,
     )
 
     # Generate features with frequency tracking
diff --git a/examples/hstu/test_utils.py b/examples/hstu/test_utils.py
@@ -577,8 +577,6 @@ def create_model(
             "item": DynamicEmbTableOptions(
                 global_hbm_for_values=1024 * 1024,  # 1M HBM (maybe cached)
                 score_strategy=DynamicEmbScoreStrategy.STEP,
-                caching=pipeline_type
-                == "prefetch",  # when prefetch is enabled, we must enable caching
             ),
         }
         if use_dynamic_emb
diff --git a/examples/hstu/training/pretrain_gr_ranking.py b/examples/hstu/training/pretrain_gr_ranking.py
@@ -80,9 +80,7 @@ def main():
     args = parser.parse_args()
     gin.parse_config_file(args.gin_config_file)
     trainer_args = TrainerArgs()
-    dataset_args, embedding_args = get_dataset_and_embedding_args(
-        trainer_args.pipeline_type == "prefetch"
-    )
+    dataset_args, embedding_args = get_dataset_and_embedding_args()
     network_args = NetworkArgs()
     optimizer_args = OptimizerArgs()
     tp_args = TensorModelParallelArgs()
diff --git a/examples/hstu/training/pretrain_gr_retrieval.py b/examples/hstu/training/pretrain_gr_retrieval.py
@@ -77,9 +77,7 @@ def main():
     args = parser.parse_args()
     gin.parse_config_file(args.gin_config_file)
     trainer_args = TrainerArgs()
-    dataset_args, embedding_args = get_dataset_and_embedding_args(
-        caching=trainer_args.pipeline_type == "prefetch"
-    )
+    dataset_args, embedding_args = get_dataset_and_embedding_args()
     network_args = NetworkArgs()
     optimizer_args = OptimizerArgs()
     tp_args = TensorModelParallelArgs()
diff --git a/examples/hstu/training/trainer/utils.py b/examples/hstu/training/trainer/utils.py
diff --git a/examples/hstu/utils/gin_config_args.py b/examples/hstu/utils/gin_config_args.py

Original file line number	Diff line number	Diff line change
`@@ -322,7 +322,6 @@ def create_dynamic_embedding_tables(args, device):`
`322`	`322`	`score_strategy=DynamicEmbScoreStrategy.LFU`
`323`	`323`	`if args.cache_algorithm == "lfu"`
`324`	`324`	`else DynamicEmbScoreStrategy.TIMESTAMP,`
`325`		`- caching=args.caching,`
`326`	`325`	`)`
`327`	`326`	`)`
`328`	`327`
Original file line number	Diff line number	Diff line change
`@@ -133,7 +133,6 @@ def test_without_eviction(`
`133`	`133`	`local_hbm_for_values=1024**3,`
`134`	`134`	`score_strategy=score_strategy,`
`135`	`135`	`num_of_buckets_per_alloc=num_embeddings[i] // bucket_capacity,`
`136`		`- caching=caching,`
`137`	`136`	`)`
`138`	`137`	`for i in range(table_num)`
`139`	`138`	`]`
Original file line number	Diff line number	Diff line change
`@@ -311,8 +311,7 @@ def test_admission_strategy_validation(`
`311`	`311`	`score_strategy`
`312`	`312`	`), # Use timestamp for admission`
`313`	`313`	`use_index_dedup=use_index_dedup,`
`314`		`- caching=caching,`
`315`		`- cache_capacity_ratio=cache_capacity_ratio if caching else 0.1,`
	`314`	`+ cache_capacity_ratio=cache_capacity_ratio if caching else 1.0,`
`316`	`315`	`admit_strategy=admission_strategy, # Pass admission strategy`
`317`	`316`	`)`
`318`	`317`
Original file line number	Diff line number	Diff line change
`@@ -270,8 +270,7 @@ def test_lfu_score_validation(`
`270`	`270`	`optimizer_kwargs=optimizer_kwargs,`
`271`	`271`	`score_strategy=DynamicEmbScoreStrategy.LFU,`
`272`	`272`	`use_index_dedup=use_index_dedup,`
`273`		`- caching=caching,`
`274`		`- cache_capacity_ratio=cache_capacity_ratio if caching else 0.1,`
	`273`	`+ cache_capacity_ratio=cache_capacity_ratio if caching else 1.0,`
`275`	`274`	`)`
`276`	`275`
`277`	`276`	`# Generate features with frequency tracking`
Original file line number	Diff line number	Diff line change
`@@ -577,8 +577,6 @@ def create_model(`
`577`	`577`	`"item": DynamicEmbTableOptions(`
`578`	`578`	`global_hbm_for_values=1024 * 1024, # 1M HBM (maybe cached)`
`579`	`579`	`score_strategy=DynamicEmbScoreStrategy.STEP,`
`580`		`- caching=pipeline_type`
`581`		`- == "prefetch", # when prefetch is enabled, we must enable caching`
`582`	`580`	`),`
`583`	`581`	`}`
`584`	`582`	`if use_dynamic_emb`