Skip to content

Commit 8400f82

Browse files
authored
Change warmup scenario for execute dummy scenario (#54)
Change the warmup scenario to execute a dummy scenario. This way we more accurately simulate the real behaviour of vLLM inference by executing the precise run that happens in real inference, but with a dummy config that we want to warm up during the warm-up process. There is no need to artificially create an inference scenario, as we are now utilizing the real execution flow. --------- Signed-off-by: Agata Dobrzyniewicz <[email protected]>
1 parent 0492c55 commit 8400f82

File tree

2 files changed

+59
-213
lines changed

2 files changed

+59
-213
lines changed

vllm_gaudi/extension/bucketing/exponential.py

Lines changed: 29 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -65,13 +65,13 @@ def get_decode_buckets(self, max_num_seqs, block_size,
6565
num_max_blocks):
6666
self.check_for_user_flags('decode')
6767
prefix_caching = get_config().prefix_caching
68-
max_blocks = num_max_blocks
6968

7069
# cfgs shape: [min, step, max, limit]
7170
decode_bs_limit = math.ceil(math.log2(max_num_seqs)) + 1
7271
decode_bs_bucket_cfg = [1, 2, max_num_seqs, decode_bs_limit]
73-
max_decode_block_limit = math.ceil(math.log2(max_blocks)) + 1
74-
decode_block_bucket_cfg = [block_size, block_size, max_blocks, max_decode_block_limit]
72+
max_decode_block_limit = math.ceil(math.log2(num_max_blocks)) + 1
73+
max_decode_blocks = min((max_model_len // block_size * max_num_seqs), num_max_blocks)
74+
decode_block_bucket_cfg = [1, max_num_seqs, max_decode_blocks, max_decode_block_limit]
7575

7676
msg = ("Decode bucket config (min, step, max_warmup, limit) "
7777
f"bs:{decode_bs_bucket_cfg}, "
@@ -163,54 +163,37 @@ def generate_prompt_buckets(bs_bucket_config,
163163

164164

165165
def generate_decode_buckets(bs_bucket_config, blocks_bucket_config,
166-
max_blocks, max_model_len, block_size,
167-
skip_invalid=False):
166+
max_blocks, max_model_len, block_size):
168167
buckets = []
169168
bs_buckets = warmup_range_with_limit(bs_bucket_config)
170-
tmp_blocks_bucket_config = blocks_bucket_config
171-
tmp_blocks_bucket_config = (*tmp_blocks_bucket_config[:2], max_blocks, tmp_blocks_bucket_config[-1])
172-
block_buckets = warmup_range_with_limit(tmp_blocks_bucket_config)
173-
last_bucket = max_blocks
169+
block_buckets = warmup_range_with_limit(blocks_bucket_config)
174170
valid_blocks = set()
175-
if not skip_invalid:
176-
#NOTE(kzawora): this case will generate all possible combinations of
177-
# exponentially-spaced bs and blocks, even if combination is
178-
# invalid (exceeds max_model_len). Unfortunately, this is necessary
179-
# to handle scenario where bucket dimensions are determined by
180-
# get_padded_decode_num_blocks or get_padded_decode_batch_size,
181-
# since they don't include information about the other dimension.
182-
# This will need to be refactored at some point in the model runner,
183-
# but for now, we are dealing with this.
184-
valid_blocks = set((bs, 1, x) for x in sorted(block_buckets) for bs in bs_buckets)
185-
else:
186-
#NOTE(kzawora): this case will generate only valid combinations of
187-
# exponentially-spaced bs and blocks, where the product of bs and blocks
188-
# is less than or equal to max_model_len. To handle corner cases
189-
# (e.g. longer context due to fragmentation), we're adding an additional
190-
# bucket with max_blocks for each batch size.
191-
# For this to work properly, bucket dimensions need be requested as
192-
# a combination of (batch_size, num_blocks), not separately.
193-
for bs in bs_buckets:
194-
max_blocks_per_bs = min(bs * math.ceil(max_model_len / block_size), last_bucket)
195-
upper_bucket_bound = next(x for x in sorted(block_buckets) if x >= max_blocks_per_bs)
196-
valid_blocks = set((bs, 1, x) for x in sorted(block_buckets) if x <= upper_bucket_bound)
197-
198-
buckets.extend(list(valid_blocks))
199-
return list(sorted(buckets, key=lambda b: (b[0] * b[1], b[1], b[0])))
200-
201-
202-
def warmup_range_with_limit(config: Tuple[int, int, int, int], long_context=False, fill=True):
171+
#NOTE(kzawora): generate only valid combinations of
172+
# exponentially-spaced bs and blocks, where the product of bs and blocks
173+
# is less than or equal to max_model_len. To handle corner cases
174+
# (e.g. longer context due to fragmentation), we're adding an additional
175+
# bucket with max_blocks for each batch size.
176+
# For this to work properly, bucket dimensions need be requested as
177+
# a combination of (batch_size, num_blocks), not separately.
178+
for bs in bs_buckets:
179+
max_blocks_per_bs = min(bs * math.ceil(max_model_len / block_size), max_blocks)
180+
try:
181+
upper_bucket_bound = max(x for x in sorted(block_buckets) if x <= max_blocks_per_bs)
182+
except ValueError:
183+
continue
184+
valid_blocks = set((bs, 1, x) for x in sorted(block_buckets) if x <= upper_bucket_bound \
185+
and bs <= x)
186+
buckets.extend(valid_blocks)
187+
return list(buckets)
188+
189+
190+
def warmup_range_with_limit(config: Tuple[int, int, int, int], long_context=False):
203191
"""
204192
NOTE(kzawora): we'll use exponential spacing for buckets in which scaled
205193
power will return bmin for first bucket iteration, and bmax for last
206194
iteration, with elements between determined by the exponent, and base being
207-
unchanged. Note that after padding to bstep, duplicates may occur.
208-
Handling of duplicates is configured by fill parameter.
209-
If fill is False, duplicates are removed and less buckets are returned.
210-
211-
If fill is True, duplicates are resolved by selecting the closest (larger
212-
or smaller) bucket. If duplicate resolution is not possible, less buckets
213-
are returned. In that case, buckets are guaranteed to be linearly spaced.
195+
unchanged. Note that after padding to bstep, duplicates may occur, and
196+
then shall be removed.
214197
Example (bmin=128, bstep=128, bmax=2048, num_buckets=10):
215198
There are 16 possible buckets (2048/128), and we'll attempt to select 10 of
216199
them with exponential spacing.
@@ -226,37 +209,13 @@ def warmup_range_with_limit(config: Tuple[int, int, int, int], long_context=Fals
226209
scaled_powers_unpadded = [bmin*base^0(==bmin), bmin*base^1, bmin*base^2, ..., bmin*base^9(==bmax)]
227210
scaled_powers_unpadded = [128.00, 174.18, 237.02, 322.54, 438.91, 597.26, 812.75, 1105.98, 1505.01, 2048.00]
228211
229-
if fill is False:
212+
We then remove duplicate buckets:
230213
scaled_powers_padded = [ 128, 256, 256, 384, 512, 640, 896, 1152, 1536, 2048]
231214
^_______^
232215
duplicates
233216
buckets = [ 128, 256, 384, 512, 640, 896, 1152, 1536, 2048]
234217
^
235218
duplicate bucket removed
236-
len(buckets) = 9, num_buckets = 10
237-
if fill is True:
238-
buckets = [ 128, 256, 384, 512, 640, 768, 896, 1152, 1536, 2048]
239-
^_______^_______^_______^
240-
closest unused buckets selected
241-
^_______^_______^
242-
these become duplicates once previous duplicates are resolved
243-
244-
In this case we'll have four duplicated buckets:
245-
174.18 -> 256, optimal bucket,
246-
237.02 -> (256) -> 384, taking closest available bucket,
247-
as optimal bucket 256 was already captured by 174.18,
248-
322.54 -> (384) -> 512, taking closest available bucket,
249-
as optimal bucket 384 was already captured by 237.02,
250-
438.91 -> (512) -> 640, taking closest available bucket,
251-
as optimal bucket 512 was already captured by 322.54,
252-
597.26 -> (640) -> 768, taking closest available bucket,
253-
as optimal bucket 640 was already captured by 438.91,
254-
812.75 -> 896, optimal bucket
255-
len(buckets) = 10, num_buckets = 10
256-
In this case, the end result has the same buckets as fill=False,
257-
but with additional bucket 768 added.
258-
The difference is more pronounced for larger ranges and larger number
259-
of buckets.
260219
""" # noqa: E501
261220

262221
bmin, bstep, bmax, num_buckets = config
@@ -281,15 +240,7 @@ def warmup_range_with_limit(config: Tuple[int, int, int, int], long_context=Fals
281240
bucket = bmax
282241
else:
283242
bucket = math.ceil(power_unpadded / bstep) * bstep
284-
if fill and bucket in buckets:
285-
available_buckets = linear_buckets.difference(buckets)
286-
if len(available_buckets) == 0:
287-
break # there are no more unique buckets, let's exit now
288-
new_bucket = min(available_buckets,
289-
key=lambda x: abs(x - power_unpadded))
290-
buckets.add(new_bucket)
291-
else:
292-
buckets.add(bucket)
243+
buckets.add(bucket)
293244

294245
if long_context:
295246
#tmp_step = bmax / num_buckets

0 commit comments

Comments
 (0)