@@ -273,20 +273,18 @@ def get_dynamic_tensor_specs(cls,
273273 ep_size : int ) -> Tuple [DynamicTensorSpec , ...]:
274274 HIDDEN_STATES_IDX = 2
275275 TUNED_DIM = 0
276- MAX_PROFILE_BUCKET = 4096
277276
277+ # Extend max profiled bucket by ep_size
278+ MAX_PROFILE_BUCKET = 4096 * ep_size
278279 m_values = get_last_power_of_2_num_tokens_buckets (MAX_PROFILE_BUCKET )
279280
280- def round_rule (x : int , ep_size_ : int ) -> int :
281- return min (
282- last_positive_power_of_2 (x ) // ep_size_ , MAX_PROFILE_BUCKET )
281+ # 1/ep_size is the expected token fill rate
282+ # The fill rate maps the buffer size to the expected token count, i.e. the actual amount of work
283+ round_rule = lambda x : min (last_positive_power_of_2 (x // ep_size ),
284+ MAX_PROFILE_BUCKET )
283285
284- specs = (DynamicTensorSpec (
285- HIDDEN_STATES_IDX ,
286- TUNED_DIM ,
287- m_values ,
288- map_to_tuning_buckets = lambda x : round_rule (x , 1 ),
289- map_to_runtime_buckets = lambda x : round_rule (x , ep_size )), )
286+ specs = (DynamicTensorSpec (HIDDEN_STATES_IDX , TUNED_DIM , m_values ,
287+ round_rule ), )
290288
291289 return specs
292290
@@ -619,20 +617,18 @@ def get_dynamic_tensor_specs(cls,
619617 ep_size : int ) -> Tuple [DynamicTensorSpec , ...]:
620618 HIDDEN_STATES_IDX = 2
621619 TUNED_DIM = 0
622- MAX_PROFILE_BUCKET = 4096
623620
621+ # Extend max profiled bucket by ep_size
622+ MAX_PROFILE_BUCKET = 4096 * ep_size
624623 m_values = get_last_power_of_2_num_tokens_buckets (MAX_PROFILE_BUCKET )
625624
626- def round_rule (x : int , ep_size_ : int ) -> int :
627- return min (
628- last_positive_power_of_2 (x ) // ep_size_ , MAX_PROFILE_BUCKET )
625+ # 1/ep_size is the expected token fill rate
626+ # The fill rate maps the buffer size to the expected token count, i.e. the actual amount of work
627+ round_rule = lambda x : min (last_positive_power_of_2 (x // ep_size ),
628+ MAX_PROFILE_BUCKET )
629629
630- specs = (DynamicTensorSpec (
631- HIDDEN_STATES_IDX ,
632- TUNED_DIM ,
633- m_values ,
634- map_to_tuning_buckets = lambda x : round_rule (x , 1 ),
635- map_to_runtime_buckets = lambda x : round_rule (x , ep_size )), )
630+ specs = (DynamicTensorSpec (HIDDEN_STATES_IDX , TUNED_DIM , m_values ,
631+ round_rule ), )
636632
637633 return specs
638634
@@ -914,20 +910,18 @@ def get_dynamic_tensor_specs(cls,
914910 ep_size : int ) -> Tuple [DynamicTensorSpec , ...]:
915911 HIDDEN_STATES_IDX = 2
916912 TUNED_DIM = 0
917- MAX_PROFILE_BUCKET = 4096
918913
914+ # Extend max profiled bucket by ep_size
915+ MAX_PROFILE_BUCKET = 4096 * ep_size
919916 m_values = get_last_power_of_2_num_tokens_buckets (MAX_PROFILE_BUCKET )
920917
921- def round_rule (x : int , ep_size_ : int ) -> int :
922- return min (
923- last_positive_power_of_2 (x ) // ep_size_ , MAX_PROFILE_BUCKET )
918+ # 1/ep_size is the expected token fill rate
919+ # The fill rate maps the buffer size to the expected token count, i.e. the actual amount of work
920+ round_rule = lambda x : min (last_positive_power_of_2 (x // ep_size ),
921+ MAX_PROFILE_BUCKET )
924922
925- specs = (DynamicTensorSpec (
926- HIDDEN_STATES_IDX ,
927- TUNED_DIM ,
928- m_values ,
929- map_to_tuning_buckets = lambda x : round_rule (x , 1 ),
930- map_to_runtime_buckets = lambda x : round_rule (x , ep_size )), )
923+ specs = (DynamicTensorSpec (HIDDEN_STATES_IDX , TUNED_DIM , m_values ,
924+ round_rule ), )
931925
932926 return specs
933927
@@ -1214,20 +1208,18 @@ def get_dynamic_tensor_specs(cls,
12141208 ep_size : int ) -> Tuple [DynamicTensorSpec , ...]:
12151209 HIDDEN_STATES_IDX = 2
12161210 TUNED_DIM = 0
1217- MAX_PROFILE_BUCKET = 4096
12181211
1212+ # Extend max profiled bucket by ep_size
1213+ MAX_PROFILE_BUCKET = 4096 * ep_size
12191214 m_values = get_last_power_of_2_num_tokens_buckets (MAX_PROFILE_BUCKET )
12201215
1221- def round_rule (x : int , ep_size_ : int ) -> int :
1222- return min (
1223- last_positive_power_of_2 (x ) // ep_size_ , MAX_PROFILE_BUCKET )
1216+ # 1/ep_size is the expected token fill rate
1217+ # The fill rate maps the buffer size to the expected token count, i.e. the actual amount of work
1218+ round_rule = lambda x : min (last_positive_power_of_2 (x // ep_size ),
1219+ MAX_PROFILE_BUCKET )
12241220
1225- specs = (DynamicTensorSpec (
1226- HIDDEN_STATES_IDX ,
1227- TUNED_DIM ,
1228- m_values ,
1229- map_to_tuning_buckets = lambda x : round_rule (x , 1 ),
1230- map_to_runtime_buckets = lambda x : round_rule (x , ep_size )), )
1221+ specs = (DynamicTensorSpec (HIDDEN_STATES_IDX , TUNED_DIM , m_values ,
1222+ round_rule ), )
12311223
12321224 return specs
12331225
@@ -1492,20 +1484,18 @@ def get_dynamic_tensor_specs(cls,
14921484 ep_size : int ) -> Tuple [DynamicTensorSpec , ...]:
14931485 HIDDEN_STATES_IDX = 2
14941486 TUNED_DIM = 0
1495- MAX_PROFILE_BUCKET = 4096
14961487
1488+ # Extend max profiled bucket by ep_size
1489+ MAX_PROFILE_BUCKET = 4096 * ep_size
14971490 m_values = get_last_power_of_2_num_tokens_buckets (MAX_PROFILE_BUCKET )
14981491
1499- def round_rule (x : int , ep_size_ : int ) -> int :
1500- return min (
1501- last_positive_power_of_2 (x ) // ep_size_ , MAX_PROFILE_BUCKET )
1492+ # 1/ep_size is the expected token fill rate
1493+ # The fill rate maps the buffer size to the expected token count, i.e. the actual amount of work
1494+ round_rule = lambda x : min (last_positive_power_of_2 (x // ep_size ),
1495+ MAX_PROFILE_BUCKET )
15021496
1503- specs = (DynamicTensorSpec (
1504- HIDDEN_STATES_IDX ,
1505- TUNED_DIM ,
1506- m_values ,
1507- map_to_tuning_buckets = lambda x : round_rule (x , 1 ),
1508- map_to_runtime_buckets = lambda x : round_rule (x , ep_size )), )
1497+ specs = (DynamicTensorSpec (HIDDEN_STATES_IDX , TUNED_DIM , m_values ,
1498+ round_rule ), )
15091499
15101500 return specs
15111501
@@ -1755,20 +1745,18 @@ def get_dynamic_tensor_specs(cls,
17551745 ep_size : int ) -> Tuple [DynamicTensorSpec , ...]:
17561746 HIDDEN_STATES_IDX = 2
17571747 TUNED_DIM = 0
1758- MAX_PROFILE_BUCKET = 4096
17591748
1749+ # Extend max profiled bucket by ep_size
1750+ MAX_PROFILE_BUCKET = 4096 * ep_size
17601751 m_values = get_last_power_of_2_num_tokens_buckets (MAX_PROFILE_BUCKET )
17611752
1762- def round_rule (x : int , ep_size_ : int ) -> int :
1763- return min (
1764- last_positive_power_of_2 (x ) // ep_size_ , MAX_PROFILE_BUCKET )
1753+ # 1/ep_size is the expected token fill rate
1754+ # The fill rate maps the buffer size to the expected token count, i.e. the actual amount of work
1755+ round_rule = lambda x : min (last_positive_power_of_2 (x // ep_size ),
1756+ MAX_PROFILE_BUCKET )
17651757
1766- specs = (DynamicTensorSpec (
1767- HIDDEN_STATES_IDX ,
1768- TUNED_DIM ,
1769- m_values ,
1770- map_to_tuning_buckets = lambda x : round_rule (x , 1 ),
1771- map_to_runtime_buckets = lambda x : round_rule (x , ep_size )), )
1758+ specs = (DynamicTensorSpec (HIDDEN_STATES_IDX , TUNED_DIM , m_values ,
1759+ round_rule ), )
17721760
17731761 return specs
17741762
0 commit comments