vllm-project
diff --git a/‎tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py‎
Lines changed: 3 additions & 5 deletions b/‎tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py‎
Lines changed: 3 additions & 5 deletions
diff --git a/‎tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py‎
Lines changed: 3 additions & 5 deletions b/‎tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py‎
Lines changed: 3 additions & 5 deletions
diff --git a/‎vllm/model_executor/__init__.py‎
Lines changed: 0 additions & 2 deletions b/‎vllm/model_executor/__init__.py‎
Lines changed: 0 additions & 2 deletions
diff --git a/‎vllm/model_executor/layers/logits_processor.py‎
Lines changed: 0 additions & 2 deletions b/‎vllm/model_executor/layers/logits_processor.py‎
Lines changed: 0 additions & 2 deletions
diff --git a/‎vllm/model_executor/models/apertus.py‎
Lines changed: 1 addition & 4 deletions b/‎vllm/model_executor/models/apertus.py‎
Lines changed: 1 addition & 4 deletions
diff --git a/‎vllm/model_executor/models/arcee.py‎
Lines changed: 3 additions & 4 deletions b/‎vllm/model_executor/models/arcee.py‎
Lines changed: 3 additions & 4 deletions
diff --git a/‎vllm/model_executor/models/arctic.py‎
Lines changed: 1 addition & 4 deletions b/‎vllm/model_executor/models/arctic.py‎
Lines changed: 1 addition & 4 deletions
diff --git a/‎vllm/model_executor/models/aria.py‎
Lines changed: 2 additions & 5 deletions b/‎vllm/model_executor/models/aria.py‎
Lines changed: 2 additions & 5 deletions
diff --git a/‎vllm/model_executor/models/aya_vision.py‎
Lines changed: 1 addition & 4 deletions b/‎vllm/model_executor/models/aya_vision.py‎
Lines changed: 1 addition & 4 deletions
diff --git a/‎vllm/model_executor/models/baichuan.py‎
Lines changed: 1 addition & 4 deletions b/‎vllm/model_executor/models/baichuan.py‎
Lines changed: 1 addition & 4 deletions
@@ -9,7 +9,6 @@
                                               LlavaForConditionalGeneration,
                                               LlavaMultiModalProcessor,
                                               LlavaProcessingInfo)
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 
 
@@ -18,11 +17,10 @@
                                         dummy_inputs=LlavaDummyInputsBuilder)
 class MyLlava(LlavaForConditionalGeneration):
 
-    def compute_logits(
-            self, hidden_states: torch.Tensor,
-            sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]:
+    def compute_logits(self,
+                       hidden_states: torch.Tensor) -> Optional[torch.Tensor]:
         # this dummy model always predicts the first token
-        logits = super().compute_logits(hidden_states, sampling_metadata)
+        logits = super().compute_logits(hidden_states)
         if logits is not None:
             logits.zero_()
             logits[:, 0] += 1.0
 
@@ -6,16 +6,14 @@
 import torch
 
 from vllm.model_executor.models.opt import OPTForCausalLM
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 
 
 class MyOPTForCausalLM(OPTForCausalLM):
 
-    def compute_logits(
-            self, hidden_states: torch.Tensor,
-            sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]:
+    def compute_logits(self,
+                       hidden_states: torch.Tensor) -> Optional[torch.Tensor]:
         # this dummy model always predicts the first token
-        logits = super().compute_logits(hidden_states, sampling_metadata)
+        logits = super().compute_logits(hidden_states)
         if logits is not None:
             logits.zero_()
             logits[:, 0] += 1.0
 
@@ -3,11 +3,9 @@
 
 from vllm.model_executor.parameter import (BasevLLMParameter,
                                            PackedvLLMParameter)
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.utils import set_random_seed
 
 __all__ = [
-    "SamplingMetadata",
     "set_random_seed",
     "BasevLLMParameter",
     "PackedvLLMParameter",
 
@@ -10,7 +10,6 @@
 from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.platforms import current_platform
 
 
@@ -50,7 +49,6 @@ def forward(
         self,
         lm_head: VocabParallelEmbedding,
         hidden_states: torch.Tensor,
-        sampling_metadata: Optional[SamplingMetadata] = None,
         embedding_bias: Optional[torch.Tensor] = None,
     ) -> Optional[torch.Tensor]:
         if self.logits_as_input:
 
@@ -48,7 +48,6 @@
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, maybe_remap_kv_scale_name)
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
@@ -566,10 +565,8 @@ def forward(
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata)
+        logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str,
 
@@ -399,11 +399,10 @@ def forward(
                                   inputs_embeds=inputs_embeds)
         return model_output
 
-    def compute_logits(self, hidden_states: torch.Tensor,
-                       sampling_metadata) -> Optional[torch.Tensor]:
+    def compute_logits(self,
+                       hidden_states: torch.Tensor) -> Optional[torch.Tensor]:
         # Compute final logits from hidden states (last pipeline rank only)
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata)
+        logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
 
@@ -30,7 +30,6 @@
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
@@ -456,10 +455,8 @@ def forward(
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata)
+        logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str,
 
@@ -19,7 +19,6 @@
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, maybe_remap_kv_scale_name)
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
                                     MultiModalKwargsItems)
@@ -644,10 +643,8 @@ def forward(
 
         return hidden_states
 
-    def compute_logits(self, hidden_states: torch.Tensor,
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata)
+    def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
 
@@ -16,7 +16,6 @@
     get_optimal_tiled_canvas)
 
 from vllm.config import VllmConfig
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargsItems
 from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
@@ -464,7 +463,5 @@ def forward(
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        return self.language_model.compute_logits(hidden_states,
-                                                  sampling_metadata)
+        return self.language_model.compute_logits(hidden_states)
@@ -46,7 +46,6 @@
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, row_parallel_weight_loader)
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant
@@ -421,10 +420,8 @@ def forward(
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata)
+        logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str,