|
@@ -12,11 +12,9 @@
 from transformers.utils import SAFE_WEIGHTS_INDEX_NAME

 from vllm import envs
-from vllm.config import LoadConfig, LoadFormat, ModelConfig, VllmConfig
+from vllm.config import LoadConfig, LoadFormat, ModelConfig
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader.base_loader import BaseModelLoader
-from vllm.model_executor.model_loader.utils import (
-    initialize_model, process_weights_after_loading, set_default_torch_dtype)
 from vllm.model_executor.model_loader.weight_utils import (
     download_safetensors_index_file_from_hf, download_weights_from_hf,
     fastsafetensors_weights_iterator, filter_duplicate_safetensors_files,
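The dropped `vllm.model_executor.model_loader.utils` imports (`initialize_model`, `process_weights_after_loading`, `set_default_torch_dtype`) and the dropped `VllmConfig` are no longer needed in this file because the model-construction logic that used them leaves this loader, as the next hunk shows.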
@@ -264,32 +262,20 @@ def download_model(self, model_config: ModelConfig) -> None: |
                               fall_back_to_pt=True,
                               allow_patterns_overrides=None)

-    def load_model(self, vllm_config: VllmConfig,
-                   model_config: ModelConfig) -> nn.Module:
-        device_config = vllm_config.device_config
-        target_device = torch.device(device_config.device)
-        with set_default_torch_dtype(model_config.dtype):
-            with target_device:
-                model = initialize_model(vllm_config=vllm_config,
-                                         model_config=model_config)
-
-        weights_to_load = {name for name, _ in model.named_parameters()}
-        loaded_weights = model.load_weights(
-            self.get_all_weights(model_config, model))
-        self.counter_after_loading_weights = time.perf_counter()
-        logger.info(
-            "Loading weights took %.2f seconds",
-            self.counter_after_loading_weights -
-            self.counter_before_loading_weights)
-        # We only enable strict check for non-quantized models
-        # that have loaded weights tracking currently.
-        if model_config.quantization is None and loaded_weights is not None:
-            weights_not_loaded = weights_to_load - loaded_weights
-            if weights_not_loaded:
-                raise ValueError(
-                    "Following weights were not initialized from "
-                    f"checkpoint: {weights_not_loaded}")
-
-        process_weights_after_loading(model, model_config, target_device)
-
-        return model.eval()
+    def load_weights(self, model: nn.Module,
+                     model_config: ModelConfig) -> None:
+        weights_to_load = {name for name, _ in model.named_parameters()}
+        loaded_weights = model.load_weights(
+            self.get_all_weights(model_config, model))
+        self.counter_after_loading_weights = time.perf_counter()
+        logger.info(
+            "Loading weights took %.2f seconds",
+            self.counter_after_loading_weights -
+            self.counter_before_loading_weights)
+        # We only enable strict check for non-quantized models
+        # that have loaded weights tracking currently.
+        if model_config.quantization is None and loaded_weights is not None:
+            weights_not_loaded = weights_to_load - loaded_weights
+            if weights_not_loaded:
+                raise ValueError("Following weights were not initialized from "
+                                 f"checkpoint: {weights_not_loaded}")
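This hunk narrows the loader's responsibility from a full `load_model` to a `load_weights` hook: model construction, dtype/device handling, and post-processing no longer happen here. Below is a minimal sketch of the shared driver this refactor implies, assuming it now lives once on `BaseModelLoader` (an assumption inferred from the removed imports; this is not code shown in the PR):

```python
# Sketch only: a plausible shape for the shared load_model() driver on
# BaseModelLoader after per-loader load_weights() hooks are extracted.
from abc import ABC, abstractmethod

import torch
import torch.nn as nn

from vllm.config import ModelConfig, VllmConfig
from vllm.model_executor.model_loader.utils import (
    initialize_model, process_weights_after_loading, set_default_torch_dtype)


class BaseModelLoader(ABC):

    @abstractmethod
    def download_model(self, model_config: ModelConfig) -> None:
        ...

    @abstractmethod
    def load_weights(self, model: nn.Module,
                     model_config: ModelConfig) -> None:
        """Loader-specific weight loading, e.g. the hook added above."""
        ...

    def load_model(self, vllm_config: VllmConfig,
                   model_config: ModelConfig) -> nn.Module:
        """Shared driver, mirroring the body removed from this loader."""
        device_config = vllm_config.device_config
        target_device = torch.device(device_config.device)
        with set_default_torch_dtype(model_config.dtype):
            with target_device:
                model = initialize_model(vllm_config=vllm_config,
                                         model_config=model_config)
        # Delegate to the loader-specific hook introduced in this diff.
        self.load_weights(model, model_config)
        process_weights_after_loading(model, model_config, target_device)
        return model.eval()
```

With this template-method shape, each concrete loader implements only `download_model` and `load_weights`, so the construction/dtype/device/post-processing sequence is written once rather than duplicated across loaders.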