Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,6 @@ def create_dynamic_embedding_tables(args, device):
score_strategy=DynamicEmbScoreStrategy.LFU
if args.cache_algorithm == "lfu"
else DynamicEmbScoreStrategy.TIMESTAMP,
caching=args.caching,
)
)

Expand Down
50 changes: 30 additions & 20 deletions corelib/dynamicemb/dynamicemb/batched_dynamicemb_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -528,7 +528,7 @@ def __init__(
def _create_cache_storage(self) -> None:
self._storages: List[Storage] = []
self._caches: List[Cache] = []
self._caching = self._dynamicemb_options[0].caching
self._caching = False

for option in self._dynamicemb_options:
if option.training and option.optimizer_type == OptimizerType.Null:
Expand All @@ -539,23 +539,36 @@ def _create_cache_storage(self) -> None:
"Set OptimizerType to Null as not on training mode.", UserWarning
)

if option.caching and option.training:
cache_option = deepcopy(option)
cache_option.bucket_capacity = 1024
capacity = get_constraint_capacity(
option.local_hbm_for_values,
option.embedding_dtype,
option.dim,
option.optimizer_type,
cache_option.bucket_capacity,
)
if capacity == 0:
raise ValueError(
"Can't use caching mode as the reserved HBM size is too small."
)
value_size = get_value_size(
option.embedding_dtype,
option.dim,
option.optimizer_type,
)
total_table_bytes = value_size * option.max_capacity
hbm_budget = option.local_hbm_for_values

if hbm_budget == 0:
# No HBM budget -> storage only, on host
option.local_hbm_for_values = 0
self._caches.append(None)
self._storages.append(DynamicEmbeddingTable(option, self._optimizer))
elif total_table_bytes <= hbm_budget:
# Entire table fits in HBM -> single table on GPU serves as
# both cache and storage (no eviction needed).
self._caching = True
table = DynamicEmbeddingTable(option, self._optimizer)
self._caches.append(table)
self._storages.append(table)
else:
# Partial HBM -> cache (GPU) + storage (host or external)
self._caching = True
bucket_capacity_for_cache = 1024
cache_capacity = hbm_budget // value_size

cache_option.max_capacity = capacity
cache_option.init_capacity = capacity
cache_option = deepcopy(option)
cache_option.bucket_capacity = bucket_capacity_for_cache
cache_option.max_capacity = cache_capacity
cache_option.init_capacity = cache_capacity
self._caches.append(
DynamicEmbeddingTable(cache_option, self._optimizer)
)
Expand All @@ -568,9 +581,6 @@ def _create_cache_storage(self) -> None:
if PS
else DynamicEmbeddingTable(storage_option, self._optimizer)
)
else:
self._caches.append(None)
self._storages.append(DynamicEmbeddingTable(option, self._optimizer))

_print_memory_consume(
self._table_names, self._dynamicemb_options, self._optimizer, self.device_id
Expand Down
48 changes: 27 additions & 21 deletions corelib/dynamicemb/dynamicemb/dynamicemb_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,22 +222,6 @@ class DynamicEmbTableOptions(_ContextOptions):
For `UNIFORM` and `TRUNCATED_NORMAL`, the `lower` and `upper` will set to $\pm {1 \over \sqrt{EmbeddingConfig.num\_embeddings}}$.
eval_initializer_args: DynamicEmbInitializerArgs
The initializer args for evaluation mode, and will return torch.zeros(...) as embedding by default if index/sparse feature is missing.
caching: bool
Flag to indicate dynamic embedding tables is working on caching mode, default to `False`.
When the device memory on a single GPU is insufficient to accommodate a single shard of the dynamic embedding table,
dynamicemb supports the mixed use of device memory and host memory(pinned memory).
But by default, the values of the entire table are concatenated with device memory and host memory.
This means that the storage location of one embedding is determined by `hash_function(key)`, and mapping to device memory will bring better lookup performance.
However, sparse features in training are often with temporal locality.
In order to store hot keys in device memory, dynamicemb creates two table instances,
whose values are stored in device memory and host memory respectively, and store hot keys on the GPU table priorily.
If the GPU table is full, the evicted keys will be inserted into the host table.
If the host table is also full, the key will be evicted(all the eviction is based on the score per key).
The original intention of eviction is based on this insight: features that only appear once should not occupy memory(even host memory) for a long time.
In short:
set **`caching=True`** will create a GPU table and a host table, and make GPU table serves as a cache;
set **`caching=False`** will create a hybrid table which use GPU and host memory in a concatenated way to store value.
All keys and other meta data are always stored on GPU for both cases.
init_capacity : Optional[int], optional
The initial capacity of the table. If not set, it defaults to max_capacity after sharding.
If `init_capacity` is provided, it will serve as the initial table capacity on a single GPU.
Expand Down Expand Up @@ -265,9 +249,9 @@ class DynamicEmbTableOptions(_ContextOptions):
Please refer to the API documentation for DynamicEmbCheckMode for more information.
global_hbm_for_values : int
Total GPU memory allocated to store embedding + optimizer states, in bytes. Default is 0.
It has different meanings under `caching=True` and `caching=False`.
When `caching=False`, it decides how much GPU memory is in the total memory to store value in a single hybrid table.
When `caching=True`, it decides the table capacity of the GPU table.
If the budget can hold the entire table (max_capacity * value_size), the table lives entirely on GPU.
If the budget is nonzero but smaller, it determines the GPU cache capacity while the full table is stored on host/external storage.
If zero, the table is stored entirely on host memory.
external_storage: Storage
The external storage/ParameterServer which inherits the interface of Storage, and can be configured per table.
If not provided, will using DynamicEmbeddingTable as the Storage.
Expand Down Expand Up @@ -297,7 +281,6 @@ class DynamicEmbTableOptions(_ContextOptions):
value=0.0,
)
)
caching: bool = False
init_capacity: Optional[
int
] = None # if not set then set to max_capacity after sharding
Expand Down Expand Up @@ -339,7 +322,6 @@ def __ne__(self, other):
def get_grouped_key(self):
grouped_key = {}
grouped_key["training"] = self.training
grouped_key["caching"] = self.caching
grouped_key["external_storage"] = self.external_storage
grouped_key["index_type"] = self.index_type
grouped_key["score_strategy"] = self.score_strategy
Expand Down Expand Up @@ -498,6 +480,30 @@ def validate_initializer_args(
initializer_args.upper = default_upper


def get_comsued_bytes_of_table(
    max_capacity,
    dtype,
    dim,
    optimizer_type,
) -> int:
    """Return the total bytes consumed by a table holding ``max_capacity`` vectors.

    Each stored vector consists of the embedding (``dim`` elements) plus the
    optimizer state for that vector, so the total is simply the per-vector
    byte size multiplied by the row count.

    NOTE(review): the name contains a typo ("comsued" -> "consumed"); it is
    kept as-is for backward compatibility with existing callers.

    Args:
        max_capacity: Number of vectors (rows) the table can hold.
        dtype: Element dtype of the embedding values.
        dim: Embedding dimension per vector.
        optimizer_type: Optimizer whose per-vector state is co-located with the value.

    Returns:
        Total memory consumption of the table in bytes.
    """
    # Delegate the per-vector size (embedding + optimizer state) to the shared
    # helper so the two size computations cannot drift apart.
    return max_capacity * get_value_size(dtype, dim, optimizer_type)


def get_value_size(
    dtype,
    dim,
    optimizer_type,
) -> int:
    """Return the bytes needed to store a single value vector.

    A value vector is the embedding itself (``dim`` elements) plus whatever
    per-vector optimizer state the given optimizer requires, all stored in
    ``dtype``.

    Args:
        dtype: Element dtype of the embedding values.
        dim: Embedding dimension per vector.
        optimizer_type: Optimizer whose per-vector state size is included.

    Returns:
        Per-vector memory consumption in bytes.
    """
    # Optimizer state contributes extra elements per vector (optimizer-dependent).
    state_dim = get_optimizer_state_dim(optimizer_type, dim, dtype)
    elements_per_vector = dim + state_dim
    return elements_per_vector * dtype_to_bytes(dtype)


def get_constraint_capacity(
memory_bytes,
dtype,
Expand Down
Loading