Commit c204d32

Warmup fix - for non-contiguous PA runs, don't take more context blocks than possible (#97)
Signed-off-by: Agata Dobrzyniewicz <[email protected]>
1 parent fe7cd43 commit c204d32
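
The bound this commit enforces is simple arithmetic: without contiguous PA, a decode batch of bs sequences can never reference more KV-cache blocks than bs * ceil(max_model_len / block_size), so warming up larger block buckets wastes warmup time. A minimal sketch of the bound, with illustrative numbers that are not taken from the commit:

import math

# Illustrative values only (not from the commit):
max_model_len = 4096   # longest sequence the model accepts
block_size = 128       # tokens per KV-cache block
bs = 8                 # decode batch size

# Each sequence needs at most ceil(max_model_len / block_size) blocks,
# so bs sequences can never touch more than bs times that many.
max_usable_blocks = bs * math.ceil(max_model_len / block_size)
print(max_usable_blocks)  # 8 * 32 = 256 blocks

Block counts above this bound are reachable only with contiguous PA, which is why the checks in the diffs below are gated on use_contiguous_pa.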

3 files changed (+17 additions, -18 deletions)


vllm_gaudi/extension/bucketing/common.py

Lines changed: 1 addition & 1 deletion
@@ -98,7 +98,7 @@ def generate_decode_buckets(self):
                 block_size = self.block_size,
                 max_num_batched_tokens = self.max_num_batched_tokens,
                 max_model_len = self.max_model_len,
-                num_max_blocks = self.num_hpu_blocks)
+                max_blocks = self.num_hpu_blocks)
             self.log_generate_info(False)
         else:
             logger().info("Bucketing is off - skipping decode buckets generation")

vllm_gaudi/extension/bucketing/exponential.py

Lines changed: 6 additions & 4 deletions
@@ -62,15 +62,17 @@ def get_prompt_buckets(self, max_num_prefill_seqs, block_size,
 
     def get_decode_buckets(self, max_num_seqs, block_size,
                            max_num_batched_tokens, max_model_len,
-                           num_max_blocks):
+                           max_blocks):
         self.check_for_user_flags('decode')
         prefix_caching = get_config().prefix_caching
+        use_contiguous_pa = get_config().use_contiguous_pa
 
         # cfgs shape: [min, step, max, limit]
         decode_bs_limit = math.ceil(math.log2(max_num_seqs)) + 1
         decode_bs_bucket_cfg = [1, 2, max_num_seqs, decode_bs_limit]
-        max_decode_block_limit = math.ceil(math.log2(num_max_blocks)) + 1
-        max_decode_blocks = min((max_model_len // block_size * max_num_seqs), num_max_blocks)
+        max_decode_block_limit = math.ceil(math.log2(max_blocks)) + 1
+        max_decode_blocks = max_blocks if use_contiguous_pa else \
+            min((max_model_len // block_size * max_num_seqs), max_blocks)
         decode_block_bucket_cfg = [1, max_num_seqs, max_decode_blocks, max_decode_block_limit]
 
         msg = ("Decode bucket config (min, step, max_warmup, limit) "
@@ -80,7 +82,7 @@ def get_decode_buckets(self, max_num_seqs, block_size,
 
         decode_buckets = generate_decode_buckets(
             decode_bs_bucket_cfg, decode_block_bucket_cfg,
-            num_max_blocks, max_model_len, block_size)
+            max_blocks, max_model_len, block_size)
 
         return sorted(decode_buckets)
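
A hedged sketch of the new max_decode_blocks selection above, with get_config() replaced by a plain boolean argument (the other names mirror the diff):

import math

def pick_max_decode_blocks(max_blocks, max_model_len, block_size,
                           max_num_seqs, use_contiguous_pa):
    # Contiguous PA may legitimately touch every HPU block.
    if use_contiguous_pa:
        return max_blocks
    # Otherwise cap at what max_num_seqs full-length sequences could need.
    return min(max_model_len // block_size * max_num_seqs, max_blocks)

# Example: 4096 HPU blocks available, but 16 sequences of at most 2048
# tokens at block size 128 can only occupy 16 * (2048 // 128) = 256 blocks.
print(pick_max_decode_blocks(4096, 2048, 128, 16, use_contiguous_pa=False))  # 256
print(pick_max_decode_blocks(4096, 2048, 128, 16, use_contiguous_pa=True))   # 4096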

vllm_gaudi/extension/bucketing/linear.py

Lines changed: 10 additions & 13 deletions
@@ -15,14 +15,12 @@ def get_prompt_buckets(self, max_num_prefill_seqs, block_size,
         use_merged_prefill = get_config().merged_prefill
         prefix_caching = get_config().prefix_caching
 
-        max_prompt_seq = max_model_len
-
         prompt_bs_bucket_cfg = read_bucket_settings(
             'prompt', 'bs', min=1, step=32,
             max=max_num_prefill_seqs)
         prompt_seq_bucket_cfg = read_bucket_settings(
             'prompt', 'seq', min=block_size,
-            step=block_size, max=max_prompt_seq)
+            step=block_size, max=max_model_len)
 
         if use_merged_prefill:
             prev_prompt_bs_bucket_cfg = tuple(prompt_bs_bucket_cfg)
@@ -56,10 +54,8 @@ def get_prompt_buckets(self, max_num_prefill_seqs, block_size,
 
     def get_decode_buckets(self, max_num_seqs, block_size,
                            max_num_batched_tokens, max_model_len,
-                           num_max_blocks):
+                           max_blocks):
         prefix_caching = get_config().prefix_caching
-
-        max_blocks = num_max_blocks
 
         decode_bs_bucket_cfg = read_bucket_settings(
             'decode', 'bs', min=1, step=32,
@@ -75,7 +71,7 @@ def get_decode_buckets(self, max_num_seqs, block_size,
 
         decode_buckets = generate_decode_buckets(
             decode_bs_bucket_cfg,
-            decode_block_bucket_cfg, num_max_blocks)
+            decode_block_bucket_cfg, max_blocks, max_model_len, block_size)
 
         return sorted(decode_buckets)
 
@@ -190,23 +186,24 @@ def generate_prompt_buckets(bs_bucket_config,
 
 
 def generate_decode_buckets(bs_bucket_config, blocks_bucket_config,
-                            max_blocks):
+                            max_blocks, max_model_len, block_size):
     buckets = []
     bs_buckets = warmup_range(bs_bucket_config)
     use_contiguous_pa = get_config().use_contiguous_pa
-    if os.environ.get('VLLM_DECODE_BLOCK_BUCKET_MAX') is None\
-            and use_contiguous_pa:
-        blocks_bucket_config[2] = max_blocks
     block_buckets = warmup_range(blocks_bucket_config)
-    if os.environ.get('VLLM_DECODE_BLOCK_BUCKET_MAX') is None\
-            and max_blocks not in block_buckets and use_contiguous_pa:
+    if max_blocks not in block_buckets and use_contiguous_pa:
         block_buckets.append(max_blocks)
     last_bucket = max_blocks
     for bs in bs_buckets:
+        max_blocks_including_max_model_len = bs * math.ceil(max_model_len / block_size)
         for blocks in block_buckets:
             if bs > blocks:
                 # Skip a dummy case when bs > blocks, which cannot occur in real execution
                 continue
+            if not use_contiguous_pa and blocks > max_blocks_including_max_model_len:
+                # Skip case when user wants to have bigger blocks than max model len;
+                # this case can only occur with contiguous PA
+                continue
             if blocks >= last_bucket:
                 buckets.append((bs, 1, last_bucket))
                 break