
Commit 1eaff27

[V0 deprecation] Remove long context LoRA (#21169)
Signed-off-by: Jee Jee Li <[email protected]>
Parent: cf8cc32 · Commit: 1eaff27

13 files changed: +35 -301 lines

tests/lora/conftest.py

Lines changed: 0 additions & 5 deletions

@@ -221,11 +221,6 @@ def phi2_lora_files():
     return snapshot_download(repo_id="isotr0py/phi-2-test-sql-lora")
 
 
-@pytest.fixture(scope="session")
-def long_context_lora_files_16k_1():
-    return snapshot_download(repo_id="SangBinCho/long_context_16k_testing_1")
-
-
 @pytest.fixture
 def llama_2_7b_engine_extra_embeddings():
     cleanup_dist_env_and_memory(shutdown_ray=True)

tests/lora/test_peft_helper.py

Lines changed: 4 additions & 7 deletions

@@ -38,8 +38,8 @@
 ]
 
 
-def test_peft_helper_pass(long_context_lora_files_16k_1, tmp_path):
-    peft_helper = PEFTHelper.from_local_dir(long_context_lora_files_16k_1,
+def test_peft_helper_pass(sql_lora_files, tmp_path):
+    peft_helper = PEFTHelper.from_local_dir(sql_lora_files,
                                             max_position_embeddings=4096)
     lora_config = LoRAConfig(max_lora_rank=16, max_cpu_loras=3, max_loras=2)
     peft_helper.validate_legal(lora_config)
@@ -56,15 +56,12 @@ def test_peft_helper_pass(long_context_lora_files_16k_1, tmp_path):
         "embed_tokens",
         "lm_head",
     ]
-    assert peft_helper.context_length == 16384
     assert peft_helper.vllm_max_position_embeddings == 4096
-    assert peft_helper.vllm_long_context_scaling_factor == float(
-        math.ceil(peft_helper.context_length /
-                  peft_helper.vllm_max_position_embeddings))
+
     # test RSLoRA
     rslora_config = dict(use_rslora=True)
     test_dir = tmp_path / "test_rslora"
-    shutil.copytree(long_context_lora_files_16k_1, test_dir)
+    shutil.copytree(sql_lora_files, test_dir)
 
     # Load and modify configuration
     config_path = test_dir / "adapter_config.json"
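
The deleted assertions are the only place in this hunk where the long-context bookkeeping is visible. For reference, a minimal sketch of the relation they checked: the long-context scaling factor was the adapter's context length divided by the base model's maximum position embeddings, rounded up. The 16384 and 4096 values come from the removed 16k test adapter and the max_position_embeddings=4096 argument above; this is illustrative only and is not part of the updated test.

import math

# Values from the removed test: a 16k-context adapter on a 4096-position base.
context_length = 16384
vllm_max_position_embeddings = 4096

# The deleted assertion checked exactly this ceiling division.
long_context_scaling_factor = float(
    math.ceil(context_length / vllm_max_position_embeddings))
assert long_context_scaling_factor == 4.0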

vllm/config.py

Lines changed: 1 addition & 13 deletions

@@ -3014,12 +3014,7 @@ class LoRAConfig:
     (added to the base model vocabulary)."""
     lora_vocab_padding_size: ClassVar[int] = current_platform\
         .get_lora_vocab_padding_size()
-    long_lora_scaling_factors: Optional[tuple[float, ...]] = None
-    """Specify multiple scaling factors (which can be different from base model
-    scaling factor - see eg. Long LoRA) to allow for multiple LoRA adapters
-    trained with those scaling factors to be used at the same time. If not
-    specified, only adapters trained with the base model scaling factor are
-    allowed."""
+
     default_mm_loras: Optional[dict[str, str]] = None
     """Dictionary mapping specific modalities to LoRA model paths; this field
     is only applicable to multimodal models and should be leveraged when a
@@ -3052,7 +3047,6 @@ def compute_hash(self) -> str:
         factors.append(self.lora_dtype)
         factors.append(self.lora_extra_vocab_size)
         factors.append(self.lora_vocab_padding_size)
-        factors.append(self.long_lora_scaling_factors)
         factors.append(self.bias_enabled)
         hash_str = hashlib.md5(str(factors).encode(),
                                usedforsecurity=False).hexdigest()
@@ -3091,11 +3085,6 @@ def verify_with_model_config(self, model_config: ModelConfig):
         elif isinstance(self.lora_dtype, str):
             self.lora_dtype = getattr(torch, self.lora_dtype)
 
-    def verify_lora_support(self):
-        if self.long_lora_scaling_factors is not None and envs.VLLM_USE_V1:
-            raise ValueError(
-                "V1 LoRA does not support long LoRA, please use V0.")
-
 
 @config
 @dataclass(config=ConfigDict(arbitrary_types_allowed=True))
@@ -4564,7 +4553,6 @@ def __post_init__(self):
         if self.lora_config is not None:
             self.lora_config.verify_with_cache_config(self.cache_config)
             self.lora_config.verify_with_model_config(self.model_config)
-            self.lora_config.verify_lora_support()
         if self.prompt_adapter_config is not None:
             self.prompt_adapter_config.verify_with_model_config(
                 self.model_config)
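
For code that constructs LoRAConfig directly, the practical effect is that the long_lora_scaling_factors keyword no longer exists while the other fields are untouched. A minimal sketch, assuming vLLM with this commit is installed and using only constructor arguments that already appear in the test diff above:

from vllm.config import LoRAConfig

# Still valid: the remaining LoRA fields are unchanged (same call as the test).
lora_config = LoRAConfig(max_lora_rank=16, max_cpu_loras=3, max_loras=2)

# No longer accepted after this commit: the field was removed, so passing it
# is rejected at construction time (the exact exception type depends on the
# dataclass machinery; TypeError for a plain dataclass).
try:
    LoRAConfig(max_lora_rank=16, long_lora_scaling_factors=(4.0, 8.0))
except (TypeError, ValueError) as exc:
    print(f"rejected as expected: {exc}")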

vllm/engine/arg_utils.py

Lines changed: 0 additions & 5 deletions

@@ -358,8 +358,6 @@ class EngineArgs:
     max_cpu_loras: Optional[int] = LoRAConfig.max_cpu_loras
     lora_dtype: Optional[Union[str, torch.dtype]] = LoRAConfig.lora_dtype
     lora_extra_vocab_size: int = LoRAConfig.lora_extra_vocab_size
-    long_lora_scaling_factors: Optional[tuple[float, ...]] = \
-        LoRAConfig.long_lora_scaling_factors
     # PromptAdapter fields
     enable_prompt_adapter: bool = False
     max_prompt_adapters: int = PromptAdapterConfig.max_prompt_adapters
@@ -723,8 +721,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             "--lora-dtype",
             **lora_kwargs["lora_dtype"],
         )
-        lora_group.add_argument("--long-lora-scaling-factors",
-                                **lora_kwargs["long_lora_scaling_factors"])
         lora_group.add_argument("--max-cpu-loras",
                                 **lora_kwargs["max_cpu_loras"])
         lora_group.add_argument("--fully-sharded-loras",
@@ -1245,7 +1241,6 @@ def create_engine_config(
             default_mm_loras=self.default_mm_loras,
             fully_sharded_loras=self.fully_sharded_loras,
             lora_extra_vocab_size=self.lora_extra_vocab_size,
-            long_lora_scaling_factors=self.long_lora_scaling_factors,
             lora_dtype=self.lora_dtype,
             max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras
             and self.max_cpu_loras > 0 else None) if self.enable_lora else None
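
On the EngineArgs side the change is symmetrical: the dataclass field and its --long-lora-scaling-factors CLI flag are gone, and nothing replaces them. A small hedged check, using only field names visible in this diff (EngineArgs is a dataclass, so dataclasses.fields can enumerate them):

from dataclasses import fields

from vllm.engine.arg_utils import EngineArgs

field_names = {f.name for f in fields(EngineArgs)}

# Removed by this commit, together with the --long-lora-scaling-factors flag.
assert "long_lora_scaling_factors" not in field_names

# The neighbouring LoRA knobs from the hunk above are still present.
assert {"max_cpu_loras", "lora_dtype", "lora_extra_vocab_size"} <= field_names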

vllm/lora/layers.py

Lines changed: 0 additions & 90 deletions

@@ -28,8 +28,6 @@
                                                RowParallelLinear)
 # yapf: enable
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.rotary_embedding import (
-    LinearScalingRotaryEmbedding, RotaryEmbedding)
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
 from vllm.platforms import current_platform
@@ -1193,91 +1191,3 @@ def can_replace_layer(
     ) -> bool:
         # Special handling for the LogitsProcessor.
         return False
-
-
-class LinearScalingRotaryEmbeddingWithLoRA(BaseLayerWithLoRA):
-    """Implements RoPE-scaled embeddings with linear scaling for
-    multiple LoRA adapters with a specialized kernel.
-
-    Replace LinearScalingRotaryEmbedding with MultiLinearScalingRotaryEmbedding
-    which can handle multi lora adapters in a specialized kernel.
-    """
-
-    def __init__(self, base_layer: RotaryEmbedding) -> None:
-        super().__init__()
-        self.base_layer = base_layer
-
-    @property
-    def scaling_factors(self):
-        return self.base_layer.scaling_factors
-
-    @property
-    def rotary_dim(self):
-        return self.base_layer.rotary_dim
-
-    def create_lora_weights(
-        self,
-        max_loras: int,
-        lora_config: LoRAConfig,
-        model_config: Optional[PretrainedConfig] = None,
-    ) -> None:
-        scaling_factors = (list(lora_config.long_lora_scaling_factors)
-                           if lora_config.long_lora_scaling_factors else [])
-        base_scaling_factor = (self.base_layer.scaling_factor if isinstance(
-            self.base_layer, LinearScalingRotaryEmbedding) else 1.0)
-        scaling_factors = sorted(
-            list(set([base_scaling_factor] + scaling_factors)))
-        self.base_layer = LinearScalingRotaryEmbedding(
-            self.base_layer.head_size,
-            self.base_layer.rotary_dim,
-            self.base_layer.max_position_embeddings,
-            self.base_layer.base,
-            self.base_layer.is_neox_style,
-            scaling_factors,
-            self.base_layer.dtype,
-        )
-
-    def reset_lora(self, index: int):
-        ...
-
-    def set_lora(
-        self,
-        index: int,
-        lora_a: torch.Tensor,
-        lora_b: torch.Tensor,
-        embeddings_tensor: Optional[torch.Tensor],
-        bias: Optional[torch.Tensor] = None,
-    ):
-        ...
-
-    def forward(
-        self,
-        positions: torch.Tensor,
-        query: torch.Tensor,
-        key: torch.Tensor,
-    ) -> tuple[torch.Tensor, torch.Tensor]:
-        return self.base_layer(
-            positions,
-            query,
-            key,
-            offsets=self.punica_wrapper.long_lora_indices,
-        )
-
-    @property
-    def scaling_factor_to_offset(self) -> dict[float, int]:
-        return self.base_layer.scaling_factor_to_offset
-
-    @classmethod
-    def can_replace_layer(
-        cls,
-        source_layer: nn.Module,
-        lora_config: LoRAConfig,
-        packed_modules_list: list,
-        model_config: Optional[PretrainedConfig],
-    ) -> bool:
-        """Returns True if the layer can be replaced by this LoRA layer."""
-        return (type(source_layer) is LinearScalingRotaryEmbedding
-                or type(source_layer) is RotaryEmbedding)
-
-    def extra_repr(self) -> str:
-        return self.base_layer.extra_repr()
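
The deleted wrapper's runtime job was to rebuild the rotary embedding with every configured scaling factor and then shift each token's position by a per-adapter offset (punica_wrapper.long_lora_indices) so that each adapter indexed the cos/sin slice built for its own factor. Below is a simplified, self-contained sketch of that offset bookkeeping, not the actual vLLM kernel; all names are local to the example, and the 4096 base length and 4.0 factor are taken from the test values earlier in this commit.

import math

max_position_embeddings = 4096          # base model positions (from the test)
scaling_factors = sorted({1.0, 4.0})    # base factor plus one long-LoRA factor

# One cos/sin slice per factor, concatenated; remember where each slice starts.
scaling_factor_to_offset: dict[float, int] = {}
offset = 0
for factor in scaling_factors:
    scaling_factor_to_offset[factor] = offset
    offset += math.ceil(factor * max_position_embeddings)

def rope_cache_index(position: int, adapter_scaling_factor: float) -> int:
    """Index into the concatenated cache for a token served by one adapter."""
    return scaling_factor_to_offset[adapter_scaling_factor] + position

assert rope_cache_index(0, 1.0) == 0         # base-model requests start at 0
assert rope_cache_index(0, 4.0) == 4096      # long-LoRA slice starts after it
assert rope_cache_index(10000, 4.0) == 14096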
