Commit 71aded3

refactor: explicitly remove bucketing
1 parent 08bc221 commit 71aded3

File tree: 7 files changed, +23 -250 lines

optimum/neuron/models/inference/backend/config.py

Lines changed: 0 additions & 4 deletions
@@ -68,7 +68,6 @@ def __init__(
         max_context_length: int | None = None,
         output_logits: bool | None = False,
         fused_qkv: bool | None = False,
-        enable_bucketing: bool | None = False,
         target: str | None = None,  # set to "trn2" for trn2
         on_device_sampling: bool | None = False,
         max_topk: int | None = 256,
@@ -106,9 +105,6 @@ def __init__(
         self.on_device_sampling = on_device_sampling
         self.max_topk = max_topk
 
-        # Bucketing
-        self.enable_bucketing = enable_bucketing
-
         # Speculative decoding
         self.speculation_length = speculation_length
         if self.speculation_length > 0:
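
Note: with enable_bucketing gone, a caller still passing the flag now fails fast. A minimal caller-side sketch (the import path is inferred from the file location, and the other keyword values are illustrative assumptions, not part of this diff):

from optimum.neuron.models.inference.backend.config import NxDNeuronConfig

# Fine after this commit: only surviving keywords are used.
neuron_config = NxDNeuronConfig(
    max_context_length=4096,
    output_logits=False,
    fused_qkv=False,
    on_device_sampling=True,
    max_topk=256,
)

# Raises TypeError: __init__() got an unexpected keyword argument 'enable_bucketing'
# neuron_config = NxDNeuronConfig(enable_bucketing=True)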

optimum/neuron/models/inference/backend/graph_builder.py

Lines changed: 0 additions & 9 deletions
@@ -16,7 +16,6 @@
 
 import torch
 from neuronx_distributed.trace.model_builder import BaseModelInstance
-from torch_neuronx import BucketModelConfig
 
 
 class NxDGraphBuilder(ABC):
@@ -40,11 +39,3 @@ def get_model_instance(self) -> BaseModelInstance:
         Used at compilation time only when tracing the model.
         """
         raise NotImplementedError
-
-    @abstractmethod
-    def get_bucket_config(self) -> BucketModelConfig | None:
-        """Return the bucket configuration
-
-        Used at compilation time only when tracing the model.
-        """
-        raise NotImplementedError
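
With get_bucket_config removed, the abstract contract shrinks to a single tracing hook. A sketch of the resulting interface as reconstructed from this diff (the constructor is assumed from the subclass call super().__init__(tag, priority_model_idx) elsewhere in this commit):

from abc import ABC, abstractmethod

from neuronx_distributed.trace.model_builder import BaseModelInstance


class NxDGraphBuilder(ABC):
    # Assumed constructor, inferred from subclass usage in decoder_builder.py.
    def __init__(self, tag: str = "", priority_model_idx: int | None = None):
        self.tag = tag
        self.priority_model_idx = priority_model_idx

    @abstractmethod
    def get_model_instance(self) -> BaseModelInstance:
        """Return the model instance.

        Used at compilation time only when tracing the model.
        """
        raise NotImplementedError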

optimum/neuron/models/inference/backend/modules/attention/attention_base.py

Lines changed: 0 additions & 1 deletion
@@ -266,7 +266,6 @@ def get_flash_attention_strategy(self, q_len) -> FlashAttentionStrategy:
         These constraints may change later.
 
         TODO: Throw an exception instead of disabling flash attention if explicitly enabled but not eligible.
-        This must consider bucketing to avoid throwing an exception for smaller buckets.
         """
         if self._qk_scale is not None:
             # If a custom qk_scale is provided, flash attention is not supported.

optimum/neuron/models/inference/backend/modules/autobucketing.py

Lines changed: 0 additions & 131 deletions
This file was deleted.

optimum/neuron/models/inference/backend/modules/decoder/decoder_builder.py

Lines changed: 17 additions & 76 deletions
@@ -15,77 +15,29 @@
 
 import torch
 from neuronx_distributed.trace.model_builder import BaseModelInstance
-from torch_neuronx import BucketModelConfig
 from transformers import PretrainedConfig
 
 from ...config import NxDNeuronConfig
 from ...graph_builder import NxDGraphBuilder
-from ..autobucketing import (
-    get_context_encoder_bk,
-    get_generation_model_bk,
-)
 from ..generation.sampling import prepare_sampling_params
 
 
-CONTEXT_ENCODING_MODEL_TAG = "context_encoding_model"
-TOKEN_GENERATION_MODEL_TAG = "token_generation_model"
-SPECULATION_MODEL_TAG = "speculation_model"
-
-
-def get_bucket_model_config_from_tag(
-    tag, config: PretrainedConfig, neuron_config: NxDNeuronConfig, buckets: list[int]
-):
-    bucket_degree = len(buckets)
-    if bucket_degree == 1:
-        return None
-
-    pad_token = config.pad_token_id
-
-    # NOTE: KV Cache preprocessing is done within the model and not the
-    # shared buffer preprocessor due to lack of support of non-contiguous
-    # slicing of nrt tensors via the NRT API.
-    if tag == CONTEXT_ENCODING_MODEL_TAG:
-        return BucketModelConfig(
-            bucket_kernel=get_context_encoder_bk,
-            bucket_kernel_constant_args=(
-                torch.tensor(buckets),
-                pad_token,
-            ),
-            shared_state_buffer=None,
-            func_kwargs=[{"bucket_rank": i} for i in range(bucket_degree)],
-        )
-    elif tag == TOKEN_GENERATION_MODEL_TAG or tag == SPECULATION_MODEL_TAG:
-        return BucketModelConfig(
-            bucket_kernel=get_generation_model_bk,
-            bucket_kernel_constant_args=(
-                torch.tensor(buckets),
-                0,
-            ),
-            shared_state_buffer=None,
-            func_kwargs=[{"bucket_rank": i} for i in range(bucket_degree)],
-        )
-    else:
-        raise ValueError(
-            f"The supplied tag: {tag} is not supported for Bucketing. Only {CONTEXT_ENCODING_MODEL_TAG} and {TOKEN_GENERATION_MODEL_TAG} are supported"
-        )
-
-
 class NxDDecoderBuilder(NxDGraphBuilder):
     def __init__(
         self,
         config: PretrainedConfig,
         neuron_config: NxDNeuronConfig,
-        buckets: list[int],
-        bucket_n_active_tokens: bool,
+        max_tokens: int,
+        active_tokens: int,
         model_cls,
         tag="",
         priority_model_idx: int = None,
     ) -> None:
         super().__init__(tag, priority_model_idx)
         self.config = config
         self.neuron_config = neuron_config
-        self.buckets = buckets
-        self.bucket_n_active_tokens = bucket_n_active_tokens
+        self.max_tokens = max_tokens
+        self.active_tokens = active_tokens
 
         if not self.neuron_config.torch_dtype:
             self.neuron_config.torch_dtype = torch.float32
@@ -99,18 +51,16 @@ def input_generator(
         self,
     ):
         inputs = []
-        for bucket in self.buckets:
-            n_active_tokens = bucket if self.bucket_n_active_tokens else self.neuron_config.n_active_tokens
 
-            input_ids = torch.zeros((self.neuron_config.batch_size, n_active_tokens), dtype=torch.int32)
-            attention_mask = torch.zeros((self.neuron_config.batch_size, bucket), dtype=torch.int32)
-            position_ids = torch.zeros((self.neuron_config.batch_size, n_active_tokens), dtype=torch.int32)
-            seq_ids = torch.zeros((self.neuron_config.batch_size), dtype=torch.int32)
-            # Get the count of sampling params currently supported.
-            sampling_params_len = prepare_sampling_params(1).shape[1]
-            sampling_params = torch.zeros((self.neuron_config.batch_size, sampling_params_len), dtype=torch.float32)
+        input_ids = torch.zeros((self.neuron_config.batch_size, self.active_tokens), dtype=torch.int32)
+        attention_mask = torch.zeros((self.neuron_config.batch_size, self.max_tokens), dtype=torch.int32)
+        position_ids = torch.zeros((self.neuron_config.batch_size, self.active_tokens), dtype=torch.int32)
+        seq_ids = torch.zeros((self.neuron_config.batch_size), dtype=torch.int32)
+        # Get the count of sampling params currently supported.
+        sampling_params_len = prepare_sampling_params(1).shape[1]
+        sampling_params = torch.zeros((self.neuron_config.batch_size, sampling_params_len), dtype=torch.float32)
 
-            inputs.append((input_ids, attention_mask, position_ids, seq_ids, sampling_params))
+        inputs.append((input_ids, attention_mask, position_ids, seq_ids, sampling_params))
 
         return inputs
 
@@ -119,21 +69,18 @@ def get_model_instance(self):
             model_cls=self.model_cls,
             config=self.config,
             neuron_config=self.neuron_config,
-            buckets=self.buckets,
+            n_positions=self.max_tokens,
         )
 
-    def get_bucket_config(self):
-        return get_bucket_model_config_from_tag(self.tag, self.config, self.neuron_config, self.buckets)
-
 
 class DecoderModelInstance(BaseModelInstance):
-    def __init__(self, model_cls, config: PretrainedConfig, neuron_config: NxDNeuronConfig, buckets: list[int]):
+    def __init__(self, model_cls, config: PretrainedConfig, neuron_config: NxDNeuronConfig, n_positions: int):
         self.model_cls = model_cls
         self.module = None
         self.input_output_aliases = None
         self.config = config
         self.neuron_config = neuron_config
-        self.buckets = buckets
+        self.n_positions = n_positions
 
     def initialize_process_group(self, world_size):
         self.model_cls.initialize_process_group(world_size)
@@ -149,18 +96,12 @@ def load_module(self):
             else t
         )
         self.module = float_model
+        self.module.n_positions = self.n_positions
 
     def get(self, bucket_rank, **kwargs):
-        if bucket_rank is not None:
-            self.module.n_positions = self.buckets[bucket_rank]
-
-        # Currently we have to init an input_output_aliases map for
-        # each buckets, otherwise it will fail the aliasing setup when
-        # generating HLO
+        assert bucket_rank == 0
        self.input_output_aliases = {}
        num_output_from_trace = 1 if not self.neuron_config.output_logits else 2
-        # TODO: This else block is a short-term fix for Llava/ViT models to use DecoderModelInstance.
-        # Long-term, these models should use a different implementation of BaseModelInstance.
        if self.module.kv_mgr is not None:
            past_key_values = self.module.kv_mgr.past_key_values
        else:
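
For orientation, a hypothetical sketch of how the refactored builder is driven: one fixed-shape graph per phase replaces the old bucket list. The model_cls, config objects, tag strings, and the 4096/1 token counts are assumed values for illustration, not part of this diff:

# Prefill graph: every position is active, so active_tokens == max_tokens.
context_builder = NxDDecoderBuilder(
    config=config,                # a transformers PretrainedConfig
    neuron_config=neuron_config,  # an NxDNeuronConfig
    max_tokens=4096,              # attention_mask width of the traced graph
    active_tokens=4096,
    model_cls=model_cls,
    tag="context_encoding_model",
)

# Decode graph: one new token per step, so active_tokens == 1.
token_builder = NxDDecoderBuilder(
    config=config,
    neuron_config=neuron_config,
    max_tokens=4096,
    active_tokens=1,
    model_cls=model_cls,
    tag="token_generation_model",
)

# Each builder now yields exactly one dummy-input tuple and one
# DecoderModelInstance pinned to n_positions == max_tokens; the
# bucket_rank argument to get() is vestigial and asserted to be 0.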

0 commit comments
