@@ -112,9 +112,7 @@ class EngineArgs:
112112 pipeline_parallel_size : int = 1
113113 tensor_parallel_size : int = 1
114114 max_parallel_loading_workers : Optional [int ] = None
115- # NOTE(kzawora): default block size for Gaudi should be 128
116- # smaller sizes still work, but very inefficiently
117- block_size : int = 16 if not current_platform .is_hpu () else 128
115+ block_size : Optional [int ] = None
118116 enable_prefix_caching : Optional [bool ] = None
119117 disable_sliding_window : bool = False
120118 use_v2_block_manager : bool = True
@@ -1036,9 +1034,7 @@ def create_engine_config(self,
10361034 self .enable_prefix_caching = False
10371035
10381036 cache_config = CacheConfig (
1039- # neuron needs block_size = max_model_len
1040- block_size = self .block_size if self .device != "neuron" else
1041- (self .max_model_len if self .max_model_len is not None else 0 ),
1037+ block_size = self .block_size ,
10421038 gpu_memory_utilization = self .gpu_memory_utilization ,
10431039 swap_space = self .swap_space ,
10441040 cache_dtype = self .kv_cache_dtype ,
0 commit comments