diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 7e5c3f30c..8eaf84f3d 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -17,7 +17,7 @@ Please delete options that are not relevant. ## Checklist: - [ ] Please add the link of [**Integration Tests Executor** run](https://github.com/deepjavalibrary/djl-serving/actions/workflows/integration_execute.yml) with related tests. - [ ] Have you [manually built the docker image](https://github.com/deepjavalibrary/djl-serving/blob/master/serving/docker/README.md#build-docker-image) and verify the change? -- [ ] Have you run related tests? Check [how to set up the test environment here](https://github.com/deepjavalibrary/djl-serving/blob/master/.github/workflows/integration_execute.yml#L72); One example would be `pytest tests.py -k "TestCorrectnessLmiDist" -m "lmi_dist"` +- [ ] Have you run related tests? Check [how to set up the test environment here](https://github.com/deepjavalibrary/djl-serving/blob/master/.github/workflows/integration_execute.yml#L98); One example would be `pytest tests.py -k "TestVllm1" -m "vllm"` - [ ] Have you added tests that prove your fix is effective or that this feature works? - [ ] Has code been commented, particularly in hard-to-understand areas? - [ ] Have you made corresponding changes to the documentation? diff --git a/.github/workflows/llm_integration_p4d.yml b/.github/workflows/llm_integration_p4d.yml index 420b10bbf..5dbd1b6e8 100644 --- a/.github/workflows/llm_integration_p4d.yml +++ b/.github/workflows/llm_integration_p4d.yml @@ -29,83 +29,6 @@ jobs: outputs: p4d_instance_id: ${{ steps.create_gpu_p4d.outputs.action_lmic_p4d_instance_id }} - lmi-dist-test: - if: contains(fromJson('["", "aiccl"]'), github.event.inputs.run_test) - runs-on: [ self-hosted, p4d ] - timeout-minutes: 120 - needs: create-runners-p4d - steps: - - uses: actions/checkout@v4 - - name: Clean env - run: | - yes | docker system prune -a --volumes - sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/ - echo "wait dpkg lock..." - while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done - - name: Set up Python3 - uses: actions/setup-python@v5 - with: - python-version: '3.10.x' - - name: Install pip dependencies - run: pip3 install pytest requests "numpy<2" pillow huggingface_hub tqdm - - name: Build container name - run: ./serving/docker/scripts/docker_name_builder.sh lmi ${{ github.event.inputs.djl-version }} - - name: Download models and dockers - working-directory: tests/integration - run: | - docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG - - name: Test Mixtral-8x7B - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist_aiccl mixtral-8x7b-aiccl - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve - python3 llm/client.py lmi_dist_aiccl mixtral-8x7b-aiccl - ./remove_container.sh - - name: Test Llama-2-70B - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist_aiccl llama-2-70b-aiccl - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve - python3 llm/client.py lmi_dist_aiccl llama-2-70b-aiccl - ./remove_container.sh - - name: Test codellama/CodeLlama-34b-hf - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist_aiccl codellama-34b-aiccl - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve - python3 llm/client.py lmi_dist_aiccl codellama-34b-aiccl - ./remove_container.sh - - name: Test tiiuae/falcon-40b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist_aiccl falcon-40b-aiccl - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve - python3 llm/client.py lmi_dist_aiccl falcon-40b-aiccl - ./remove_container.sh - - name: Remove models dir - working-directory: tests/integration - run: | - sudo rm -rf models - - name: On fail step - if: ${{ failure() }} - working-directory: tests/integration - run: | - sudo rm -rf models - ./remove_container.sh || true - cat logs/serving.log - - name: Upload test logs - uses: actions/upload-artifact@v4 - with: - name: lmi-dist-aiccl-logs - path: tests/integration/logs/ trtllm-test: runs-on: [ self-hosted, p4d ] @@ -228,7 +151,7 @@ jobs: stop-runners-p4d: if: always() runs-on: [ self-hosted, scheduler ] - needs: [ create-runners-p4d, lmi-dist-test, trtllm-test, vllm-test ] + needs: [ create-runners-p4d, trtllm-test, vllm-test ] steps: - name: Stop all instances run: | diff --git a/.github/workflows/lmi-dist-deps-build.yml b/.github/workflows/lmi-dist-deps-build.yml deleted file mode 100644 index 9620cbbb1..000000000 --- a/.github/workflows/lmi-dist-deps-build.yml +++ /dev/null @@ -1,115 +0,0 @@ -name: LMI-Dist dependency build - -on: - workflow_dispatch: - -permissions: - id-token: write - contents: read - -jobs: - create-runners-p4d: - runs-on: [ self-hosted, scheduler ] - steps: - - name: Create new P4d.24xl instance - id: create_gpu_p4d - run: | - cd /home/ubuntu/djl_benchmark_script/scripts - token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \ - https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \ - --fail \ - | jq '.token' | tr -d '"' ) - ./start_instance.sh action_lmic_p4d $token djl-serving - outputs: - p4d_instance_id: ${{ steps.create_gpu_p4d.outputs.action_lmic_p4d_instance_id }} - - lmi-deps-build: - runs-on: - - self-hosted - - p4d - - RUN_ID-${{ github.run_id }} - - RUN_NUMBER-${{ github.run_number }} - - SHA-${{ github.sha }} - container: - image: nvidia/cuda:12.4.1-devel-ubuntu22.04 - options: --gpus all --runtime=nvidia --shm-size 20g - timeout-minutes: 90 - needs: create-runners-p4d - steps: - - uses: actions/checkout@v4 - - name: Setup Environment - run: | - apt-get update - apt-get install -y software-properties-common wget libaio-dev g++ git gcc - mkdir build_artifacts - - name: Set up Python3 - run: | - ./serving/docker/scripts/install_python.sh 3.10 - - name: Install torch dependencies - run: | - python -m venv venv - . ./venv/bin/activate - python -m pip install --upgrade pip - python -m pip install "numpy<2" cmake awscli packaging wheel setuptools ninja git-remote-codecommit \ - torch==2.3.1 --extra-index-url https://download.pytorch.org/whl/cu121 - - name: Configure AWS Credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - role-to-assume: arn:aws:iam::185921645874:role/github-actions-djl-serving - aws-region: us-east-1 - - name: Build FlashAttn V2 - run: | - . ./venv/bin/activate - export FLASH_ATTENTION_FORCE_BUILD=TRUE - git clone https://github.com/ymwangg/flash-attention flash-attention-v2 -b specdec_v0.4.2 - cd flash-attention-v2 - pip wheel . --no-deps - cp flash_attn-*.whl ../build_artifacts - - name: Build vllm 0.5.3.post1 Hanging Fix - run: | - . ./venv/bin/activate - git clone https://github.com/davidthomas426/vllm -b lmi_v11 - cd vllm - export TORCH_CUDA_ARCH_LIST="7.5 8.0 8.6 8.9 9.0+PTX" - export VLLM_INSTALL_PUNICA_KERNELS=1 - pip wheel . --no-deps - cp vllm-*.whl ../build_artifacts - - name: Upload artifacts - uses: actions/upload-artifact@v3 - with: - name: build-artifacts - path: build_artifacts/ - - lmi-deps-upload: - runs-on: - - self-hosted - - p4d - - RUN_ID-${{ github.run_id }} - - RUN_NUMBER-${{ github.run_number }} - - SHA-${{ github.sha }} - needs: lmi-deps-build - steps: - - name: Set up Python3 - uses: actions/setup-python@v5 - with: - python-version: '3.10' - - name: Install dependencies - run: pip install awscli - - name: Download built-artifacts - uses: actions/download-artifact@v3 - with: - name: build-artifacts - - name: upload to S3 - run: | - aws s3 cp vllm*.whl s3://djl-ai-staging/publish/vllm/cu124-pt231/ - - stop-runners-p4d: - if: always() - runs-on: [ self-hosted, scheduler ] - needs: [ create-runners-p4d, lmi-deps-build, lmi-deps-upload ] - steps: - - name: Stop all instances - run: | - cd /home/ubuntu/djl_benchmark_script/scripts - instance_id=${{ needs.create-runners-p4d.outputs.p4d_instance_id }} - ./stop_instance.sh $instance_id diff --git a/.github/workflows/sagemaker_llm_benchmark.yml b/.github/workflows/sagemaker_llm_benchmark.yml index 161983eb3..59140f6a0 100644 --- a/.github/workflows/sagemaker_llm_benchmark.yml +++ b/.github/workflows/sagemaker_llm_benchmark.yml @@ -47,7 +47,7 @@ jobs: strategy: fail-fast: false matrix: - engine: [lmi-dist, trtllm] + engine: [trtllm] steps: - uses: actions/checkout@v4 - name: Set up Python3 diff --git a/engines/python/setup/djl_python/chat_completions/chat_utils.py b/engines/python/setup/djl_python/chat_completions/chat_utils.py index 2b9589bdb..221c20dfc 100644 --- a/engines/python/setup/djl_python/chat_completions/chat_utils.py +++ b/engines/python/setup/djl_python/chat_completions/chat_utils.py @@ -23,7 +23,7 @@ def is_chat_completions_request(inputs: Dict) -> bool: def parse_mistral_chat_request_inputs(messages, tokenizer): # TODO: get rid of this mess of an integration # Mistral has their own tokenizer with custom tokenization logic for chat type requests - # This dependency is only available in vllm/lmi-dist, so we import it here as necessary + # This dependency is only available in vllm, so we import it here as necessary from mistral_common.protocol.instruct.request import ChatCompletionRequest chat_request = ChatCompletionRequest(messages=messages) # The tokenized object contains the converted prompt, token ids, and images @@ -76,8 +76,8 @@ def parse_chat_completions_request( images.extend(message.get_images()) # Less than ideal, but need a working solution for now - # is_mistral_tokenizer can only be true if lmi-dist or vllm - # mistral tokenization only works with these engines if we pass token ids directly, not text. + # is_mistral_tokenizer can only be true if vllm + # mistral tokenization only works with this engine if we pass token ids directly, not text. # every other use case is designed for the actual string prompt being provided... if is_mistral_tokenizer: text_inputs = parse_mistral_chat_request_inputs(messages, tokenizer) diff --git a/engines/python/setup/djl_python/neuron_utils/model_loader.py b/engines/python/setup/djl_python/neuron_utils/model_loader.py index a5a64ba54..aa78cfd87 100644 --- a/engines/python/setup/djl_python/neuron_utils/model_loader.py +++ b/engines/python/setup/djl_python/neuron_utils/model_loader.py @@ -166,7 +166,7 @@ def set_adapter_class(self): def can_use_continuous_batching(self) -> bool: """ Set configuration for continuous batching, currently all vllm implementations are continuous batching - and batch size greater than 1 for tnx and lmi-dist support rolling batch. + and batch size greater than 1 for tnx support rolling batch. :return: bool indicating if continuous batching can be used """ diff --git a/engines/python/setup/djl_python/properties_manager/hf_properties.py b/engines/python/setup/djl_python/properties_manager/hf_properties.py index 9f2359347..2b1bde2cf 100644 --- a/engines/python/setup/djl_python/properties_manager/hf_properties.py +++ b/engines/python/setup/djl_python/properties_manager/hf_properties.py @@ -64,9 +64,9 @@ def set_quantize_for_backward_compatibility(self): self.quantize = "bitsandbytes8" # TODO remove this after refactor of all handlers - # parsing bitsandbytes8, so it can be directly passed to lmi dist model loader. + # parsing bitsandbytes8, so it can be directly passed to vllm model loader. if self.quantize == "bitsandbytes8" \ - and self.rolling_batch == RollingBatchEnum.lmidist: + and self.rolling_batch == RollingBatchEnum.vllm: self.quantize = "bitsandbytes" return self @@ -123,9 +123,8 @@ def construct_kwargs_quantize(self): return self # TODO remove this after refactor of all handlers - # device map is not required for lmi dist and vllm + # device map is not required for vllm if self.rolling_batch in { - RollingBatchEnum.lmidist, RollingBatchEnum.vllm, }: return self diff --git a/engines/python/setup/djl_python/properties_manager/lmi_dist_rb_properties.py b/engines/python/setup/djl_python/properties_manager/lmi_dist_rb_properties.py deleted file mode 100644 index b8ec3cc00..000000000 --- a/engines/python/setup/djl_python/properties_manager/lmi_dist_rb_properties.py +++ /dev/null @@ -1,124 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file -# except in compliance with the License. A copy of the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" -# BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. See the License for -# the specific language governing permissions and limitations under the License. -import ast -from enum import Enum -from typing import Optional, Mapping, Tuple, Dict - -from pydantic import model_validator, field_validator - -from djl_python.properties_manager.properties import Properties - - -class LmiDistLoadFormats(str, Enum): - sagemaker_fast_model_loader = 'sagemaker_fast_model_loader' - - -class LmiDistRbProperties(Properties): - engine: Optional[str] = None - dtype: Optional[str] = "auto" - load_format: Optional[str] = "auto" - quantize: Optional[str] = None - tensor_parallel_degree: int = 1 - pipeline_parallel_degree: int = 1 - max_rolling_batch_prefill_tokens: Optional[int] = None - # Adjustable prefix model length for certain 32k or longer model - max_model_len: Optional[int] = None - # TODO: change Enforce eager to False once SageMaker driver issue resolved - enforce_eager: Optional[bool] = False - # TODO: this default may change with different vLLM versions - # TODO: try to get good default from vLLM to prevent revisiting - # TODO: last time check: vllm 0.3.1 - gpu_memory_utilization: Optional[float] = 0.9 - # TODO: speculative decoding changes - speculative_draft_model: Optional[str] = None - speculative_length: int = 4 - draft_model_tp_size: int = 1 - record_acceptance_rate: Optional[bool] = False - speculative_telemetry: Optional[bool] = True - enable_lora: Optional[bool] = False - max_loras: Optional[int] = 4 - max_lora_rank: Optional[int] = 16 - fully_sharded_loras: bool = False - lora_extra_vocab_size: Optional[int] = 256 - long_lora_scaling_factors: Optional[Tuple[float, ...]] = None - lora_dtype: Optional[str] = 'auto' - max_cpu_loras: Optional[int] = None - max_logprobs: Optional[int] = 20 - enable_chunked_prefill: Optional[bool] = None - cpu_offload_gb_per_gpu: Optional[int] = 0 - enable_prefix_caching: Optional[bool] = False - disable_sliding_window: Optional[bool] = False - limit_mm_per_prompt: Optional[Mapping[str, int]] = None - use_passive_workers: Optional[bool] = True - tokenizer_mode: str = 'auto' - - @model_validator(mode='after') - def validate_mpi(self): - if not self.mpi_mode: - raise AssertionError( - f"Need MPI engine to start lmi-dist RollingBatcher") - return self - - @model_validator(mode='after') - def validate_speculative_and_lora(self): - if self.enable_lora and self.speculative_draft_model: - raise AssertionError( - f"Cannot enable lora and speculative decoding at the same time" - ) - return self - - @model_validator(mode='after') - def validate_speculative_and_fml(self): - if self.load_format == LmiDistLoadFormats.sagemaker_fast_model_loader.value and self.speculative_draft_model: - raise AssertionError( - f"Cannot enable sagemaker_fast_model_loader and speculative decoding at the same time" - ) - return self - - @field_validator('long_lora_scaling_factors', mode='before') - def validate_long_lora_scaling_factors(cls, val): - if isinstance(val, str): - val = ast.literal_eval(val) - if not isinstance(val, tuple): - if isinstance(val, list): - val = tuple(float(v) for v in val) - elif isinstance(val, float): - val = (val, ) - elif isinstance(val, int): - val = (float(val), ) - else: - raise ValueError( - "long_lora_scaling_factors must be convertible to a tuple of floats." - ) - return val - - @field_validator('limit_mm_per_prompt', mode="before") - def validate_limit_mm_per_prompt(cls, val) -> Mapping[str, int]: - out_dict: Dict[str, int] = {} - for item in val.split(","): - kv_parts = [part.lower().strip() for part in item.split("=")] - if len(kv_parts) != 2: - raise ValueError("Each item should be in the form key=value") - key, value = kv_parts - - try: - parsed_value = int(value) - except ValueError as e: - raise ValueError( - f"Failed to parse value of item {key}={value}") from e - - if key in out_dict and out_dict[key] != parsed_value: - raise ValueError( - f"Conflicting values specified for key: {key}") - out_dict[key] = parsed_value - return out_dict diff --git a/engines/python/setup/djl_python/properties_manager/properties.py b/engines/python/setup/djl_python/properties_manager/properties.py index f48419e87..c8dd9ee91 100644 --- a/engines/python/setup/djl_python/properties_manager/properties.py +++ b/engines/python/setup/djl_python/properties_manager/properties.py @@ -20,7 +20,6 @@ class RollingBatchEnum(str, Enum): vllm = "vllm" tnx = "tnx" - lmidist = "lmi-dist" auto = "auto" disable = "disable" trtllm = "trtllm" diff --git a/engines/python/setup/djl_python/properties_manager/tnx_properties.py b/engines/python/setup/djl_python/properties_manager/tnx_properties.py index 52c67e70b..6b594c9f3 100644 --- a/engines/python/setup/djl_python/properties_manager/tnx_properties.py +++ b/engines/python/setup/djl_python/properties_manager/tnx_properties.py @@ -78,7 +78,7 @@ class TnXMemoryLayout(str, Enum): TNX_SUPPORTED_ROLLING_BATCH_TYPES = [ RollingBatchEnum.auto.value, RollingBatchEnum.vllm.value, - RollingBatchEnum.lmidist.value, RollingBatchEnum.tnx.value + RollingBatchEnum.tnx.value ] diff --git a/engines/python/setup/djl_python/rolling_batch/lmi_dist_rolling_batch.py b/engines/python/setup/djl_python/rolling_batch/lmi_dist_rolling_batch.py deleted file mode 100644 index ac7ee8cd3..000000000 --- a/engines/python/setup/djl_python/rolling_batch/lmi_dist_rolling_batch.py +++ /dev/null @@ -1,303 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file -# except in compliance with the License. A copy of the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" -# BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. See the License for -# the specific language governing permissions and limitations under the License. -import logging -import os -from typing import List, Optional -from collections import OrderedDict, defaultdict - -from lmi_dist.api import Request, RequestParams -from lmi_dist.arg_utils import VllmEngineArgs -from lmi_dist.init_engine import engine_from_args -from lmi_dist.seq2seq_engine import Seq2SeqPreprocessor -from vllm.sampling_params import RequestOutputKind -from vllm.utils import AtomicCounter - -from djl_python.rolling_batch.rolling_batch import RollingBatch, stop_on_any_exception, filter_unused_generation_params -from djl_python.rolling_batch.rolling_batch_vllm_utils import ( - get_speculative_decoding_metrics_record, update_request_cache_with_output, - supports_speculative_decoding, create_lora_request, get_lora_request, - DTYPE_MAPPER, get_prompt_inputs) -from djl_python.telemetry import telemetry_manager -from djl_python.properties_manager.lmi_dist_rb_properties import LmiDistRbProperties - -LMI_DIST_GENERATION_PARAMS = set(RequestParams().__struct_fields__) - - -class LmiDistRollingBatch(RollingBatch): - """ - LmiDistRollingBatch connects handler to LmiDist backend engine. It receives new - requests from the handler and sends them to the backend when space is available in the batch. - It also gets any new tokens from the backend and sends them back to the handler. - """ - - def __init__(self, model_id_or_path: str, properties: dict, **kwargs): - """ - Initializes the LmiDistRollingBatch. - - :param model_id_or_path (str): Currently unused since there is a copy inside properties - :param properties (dict): other properties of the model, such as decoder strategy - """ - self.lmi_dist_config = LmiDistRbProperties(**properties) - super().__init__(self.lmi_dist_config) - self.supports_speculative_decoding = supports_speculative_decoding() - engine_kwargs = {} - if self.supports_speculative_decoding: - engine_kwargs[ - "draft_model"] = self.lmi_dist_config.speculative_draft_model - engine_kwargs[ - "speculate_length"] = self.lmi_dist_config.speculative_length - engine_kwargs[ - "draft_model_tp_size"] = self.lmi_dist_config.draft_model_tp_size - engine_args = VllmEngineArgs( - model=self.lmi_dist_config.model_id_or_path, - tensor_parallel_size=self.lmi_dist_config.tensor_parallel_degree, - pipeline_parallel_size=self.lmi_dist_config. - pipeline_parallel_degree, - dtype=DTYPE_MAPPER[self.lmi_dist_config.dtype], - seed=0, - max_model_len=self.lmi_dist_config.max_model_len, - max_num_seqs=self.lmi_dist_config.max_rolling_batch_size, - enforce_eager=self.lmi_dist_config.enforce_eager, - gpu_memory_utilization=self.lmi_dist_config.gpu_memory_utilization, - max_num_batched_tokens=self.lmi_dist_config. - max_rolling_batch_prefill_tokens, - trust_remote_code=self.lmi_dist_config.trust_remote_code, - load_format=self.lmi_dist_config.load_format, - quantization=self.lmi_dist_config.quantize, - enable_lora=self.lmi_dist_config.enable_lora, - max_loras=self.lmi_dist_config.max_loras, - max_lora_rank=self.lmi_dist_config.max_lora_rank, - fully_sharded_loras=self.lmi_dist_config.fully_sharded_loras, - lora_extra_vocab_size=self.lmi_dist_config.lora_extra_vocab_size, - long_lora_scaling_factors=self.lmi_dist_config. - long_lora_scaling_factors, - lora_dtype=self.lmi_dist_config.lora_dtype, - max_cpu_loras=self.lmi_dist_config.max_cpu_loras, - revision=self.lmi_dist_config.revision, - enable_chunked_prefill=self.lmi_dist_config.enable_chunked_prefill, - cpu_offload_gb=self.lmi_dist_config.cpu_offload_gb_per_gpu, - enable_prefix_caching=self.lmi_dist_config.enable_prefix_caching, - disable_sliding_window=self.lmi_dist_config.disable_sliding_window, - limit_mm_per_prompt=self.lmi_dist_config.limit_mm_per_prompt, - use_passive_workers=self.lmi_dist_config.use_passive_workers, - tokenizer_mode=self.lmi_dist_config.tokenizer_mode, - **engine_kwargs) - - kwargs = {} - logging.info(f"engine_args: {engine_args}, kwargs: {kwargs}") - - if self.lmi_dist_config.max_rolling_batch_prefill_tokens is None: - logging.warning( - "djl-serving/lmi has changed the default behavior for max_rolling_batch_prefill_tokens in 0.30.0 (lmi v12). " - "Previously, when max_rolling_batch_prefill_tokens was unset, djl-serving would use a warmup prefill limit of 4096 tokens. " - "This behavior differs from vLLM's default behavior, which (essentially) defaults to max_model_len. As a result of this change, " - "model deployments that worked previously may fail due to higher memory requirements at model loading time for the warmup phase. " - "For more information on this change, and guidance on what configurations to set, please see " - "https://github.com/deepjavalibrary/djl-serving/tree/master/serving/docs/lmi/announcements/breaking_changes.md" - ) - self.engine = engine_from_args(engine_args, **kwargs) - self.request_cache = OrderedDict() - self.lora_id_counter = AtomicCounter(0) - self.lora_requests = {} - self.is_mistral_tokenizer = self.lmi_dist_config.tokenizer_mode == 'mistral' - self.is_t5_model = isinstance(self.engine.preprocessor, - Seq2SeqPreprocessor) - - def reset(self) -> None: - """ - Aborts all requests - """ - self.engine.reset(self.request_cache.keys()) - self.request_cache = OrderedDict() - super().reset() - - def get_tokenizer(self): - if self.is_t5_model: - return self.engine.preprocessor.tokenizer - return self.engine.preprocessor.tokenizer.tokenizer - - def get_model_config(self): - # TODO: this is a hack right now to get the model config from the engine. We should expose this as - # an interface method and retrieve it from there after v12 - return self.engine.preprocessor.model_config if not self.is_t5_model else None - - def use_vllm_chat_completions(self): - # vllm chat parsing requires 0.7.1 currently, lmi-dist is on 0.6.3.post1 - return False - - def get_huggingface_model_config(self): - # TODO: this is a hack right now to get the model config from the engine. We should expose this as - # an interface method and retrieve it from there after v12 - return self.engine.preprocessor.model_config.hf_config if not self.is_t5_model else None - - def translate_lmi_dist_params(self, parameters: dict): - """ - Helper function to convert DJL Serving parameter names to parameter names - that lmi-dist recognizes. - - :param parameters (dict): Parameters pertaining to a specific request - - :return: The same parameters dict, but with lmi-dist style parameter names. - """ - parameters["output_kind"] = RequestOutputKind.DELTA - parameters["max_tokens"] = parameters.pop("max_new_tokens", 30) - do_sample = parameters.pop("do_sample", None) - if do_sample is not None and do_sample is False: - parameters["temperature"] = 0.0 - if do_sample is None and parameters.get("temperature") is None: - parameters["temperature"] = 0.0 - if "seed" in parameters.keys(): - parameters["seed"] = int(parameters["seed"]) - if "stop_sequences" in parameters: - parameters["stop"] = parameters.pop("stop_sequences") - if "ignore_eos_token" in parameters: - parameters["ignore_eos"] = parameters.pop("ignore_eos_token") - if "num_beams" in parameters: - parameters["best_of"] = parameters.pop("num_beams") - parameters["use_beam_search"] = True - if parameters.pop("decoder_input_details", False): - parameters["prompt_logprobs"] = 1 - if "best_of" in parameters: - # if n is not explicitly set, we return `best_of` values sequences. - if "n" not in "best_of": - parameters["n"] = parameters["best_of"] - if "top_n_tokens" in parameters: - parameters["logprobs"] = parameters.pop("top_n_tokens") - else: - parameters["logprobs"] = parameters.get("logprobs", 1) - parameters = filter_unused_generation_params( - parameters, - LMI_DIST_GENERATION_PARAMS, - "lmi-dist", - remove_unused_params=True) - return parameters - - @stop_on_any_exception - def inference(self, new_requests: List[Request]) -> List: - """ - Adds new requests and gets output tokens from the backend. - - :param new_requests: List of requests - - :return results: List of dictionaries, one for each request, that contain output tokens and other data. - """ - self.add_new_requests(new_requests) - # step 0: register new requests to engine - new_lmi_dist_requests = [] - for request in new_requests: - request_id = str(request.id) - prompt_inputs = get_prompt_inputs(request) - params = self.translate_lmi_dist_params(request.parameters) - request_params = RequestParams(**params) - lora_request_params = dict() - if request.adapter is not None: - adapter_name = request.adapter.get_property("name") - lora_request_params["lora_request"] = get_lora_request( - adapter_name, self.lora_requests) - # Constructing Request in lmi-dist library - lmi_dist_request = Request(id=request_id, - prompt=prompt_inputs, - params=request_params, - **lora_request_params) - new_lmi_dist_requests.append(lmi_dist_request) - self.request_cache[request_id] = { - "request_output": request.request_output - } - if new_lmi_dist_requests: - self.engine.add_requests(new_lmi_dist_requests) - - request_outputs = self.engine.step() - - # step 1: put result to cache - for request_output in request_outputs: - self.request_cache = update_request_cache_with_output( - self.request_cache, request_output, self.get_tokenizer()) - # Record SD metrics - completion_output = request_output.outputs[0] - if ( - self.lmi_dist_config.record_acceptance_rate - or self.lmi_dist_config.speculative_telemetry - ) and self.lmi_dist_config.speculative_draft_model and request_output.finished: - try: - if self.supports_speculative_decoding and hasattr( - completion_output, 'acceptance_history'): - record = get_speculative_decoding_metrics_record( - completion_output, request_output) - if self.lmi_dist_config.record_acceptance_rate: - logging.info(f"Speculative Decoding {record}") - if self.lmi_dist_config.speculative_telemetry and os.environ.get( - "SAGEMAKER_SECURE_MODE") == "true": - telemetry_manager.record_speculative(record) - except: - logging.debug("SD telemetry collection failed, ignore") - - for request in self.active_requests: - request_output = request.request_output - if request_output.finished: - prompt_len = len(request_output.prompt_tokens_details) - if self.get_model_config(): - max_model_len = self.get_model_config().max_model_len - if prompt_len > max_model_len: - raise ValueError( - f"Input prompt ({prompt_len} tokens) is too long and exceeds limit of {max_model_len}" - ) - request.last_token = True - - return self.postprocess_results() - - def preprocess_requests(self, requests): - """ - Currently not applicable for lmi-dist. - """ - raise NotImplementedError( - "Not implemented for lmidist rolling batcher") - - def add_lora(self, - lora_name: str, - lora_path: str, - long_lora_max_len: Optional[int] = None): - """ - Add LoRA adapter. - """ - lora_id = self.lora_id_counter.inc(1) - lora_request = create_lora_request(lora_name, - lora_id, - lora_path, - long_lora_max_len=long_lora_max_len) - self.lora_requests[lora_request.lora_name] = lora_request - return self.engine.add_lora(lora_request) - - def remove_lora(self, lora_name): - """ - Remove LoRA adapter. - """ - lora_request = get_lora_request(lora_name, self.lora_requests) - return self.engine.remove_lora(lora_request.lora_int_id) - - def pin_lora(self, lora_name): - """ - Pin LoRA adapter. - """ - lora_request = get_lora_request(lora_name, self.lora_requests) - - # To pin an adapter, adapter has to be registered already (by calling add_lora()). - # If trying to pin an adapter that is not registered, we will get "LoRA is not registered" error. - # However, registered adapters are maintained by LRUCache - # and may be evicted if the number of adapters exceed capacity (max_cpu_loras). - # So there will be two scenarios: - # 1) An adapter is evicted, call add_lora() is necessary to avoid error. - # 2) An adapter is not evicted, call add_lora() is not necessary. - # But since whether an adapter is evicted is not exposed outside of engine, - # and add_lora() in this case will take negligible time, we will still call add_lora(). - loaded = self.engine.add_lora(lora_request) - return loaded and self.engine.pin_lora(lora_request.lora_int_id) diff --git a/engines/python/setup/djl_python/rolling_batch/rolling_batch_vllm_utils.py b/engines/python/setup/djl_python/rolling_batch/rolling_batch_vllm_utils.py index f037d4f97..99a2a2b00 100644 --- a/engines/python/setup/djl_python/rolling_batch/rolling_batch_vllm_utils.py +++ b/engines/python/setup/djl_python/rolling_batch/rolling_batch_vllm_utils.py @@ -43,7 +43,7 @@ def update_request_cache_with_output(request_cache: OrderedDict, cache = request_cache[request_id] request_output = cache["request_output"] - # For beam search, vllm and lmi-dist produces entirely different sequences at the same index + # For beam search, vllm produces entirely different sequences at the same index # after a certain step, despite tracking previous outputs. This leads to garbage output, so we wait till # entire generation finishes. parameters = request_output.input.parameters @@ -185,9 +185,9 @@ def get_speculative_decoding_metrics_record( def supports_speculative_decoding() -> bool: try: - # Moved the import inside a try to support neuron vllm container w/o lmi-dist - from lmi_dist.arg_utils import VllmEngineArgs - return "draft_model" in VllmEngineArgs.__annotations__ + # Check if vllm supports speculative decoding by looking for draft_model parameter + from vllm import EngineArgs + return "draft_model" in EngineArgs.__annotations__ except ImportError: return False diff --git a/engines/python/setup/djl_python/tests/rolling_batch_test_scripts/run_rolling_batch_alone.py b/engines/python/setup/djl_python/tests/rolling_batch_test_scripts/run_rolling_batch_alone.py index 8c450ced5..1485a99e8 100644 --- a/engines/python/setup/djl_python/tests/rolling_batch_test_scripts/run_rolling_batch_alone.py +++ b/engines/python/setup/djl_python/tests/rolling_batch_test_scripts/run_rolling_batch_alone.py @@ -4,7 +4,7 @@ pip install git+https://github.com/deepjavalibrary/djl-serving.git#subdirectory=engines/python/setup torchrun --standalone --nnodes=1 --nproc-per-node=4 \ - run_rolling_batch_alone.py openlm-research/open_llama_7b_v2 -rb lmi-dist + run_rolling_batch_alone.py openlm-research/open_llama_7b_v2 -rb vllm """ import argparse import logging @@ -16,10 +16,7 @@ def get_rolling_batch_class_from_str(rolling_batch_type: str): - if rolling_batch_type == "lmi-dist": - from djl_python.rolling_batch.lmi_dist_rolling_batch import LmiDistRollingBatch - return LmiDistRollingBatch - elif rolling_batch_type == "vllm": + if rolling_batch_type == "vllm": from djl_python.rolling_batch.vllm_rolling_batch import VLLMRollingBatch logging.warning( "vLLM rolling batcher is experimental, use with caution") @@ -142,7 +139,7 @@ def simulator(batcher, parser.add_argument("-rb", "--rollingbatch", type=str, - choices=["vllm", "lmi-dist", "neuron"]) + choices=["vllm", "neuron"]) parser.add_argument("--properties", type=str, required=False, @@ -157,9 +154,7 @@ def simulator(batcher, "trust_remote_code": True, "engine": "Python" } - if args.rollingbatch == "lmi-dist": - dist.init_process_group("nccl") - properties["engine"] = "MPI" + batcher = init_rolling_batch(args.rollingbatch, args.model_id, properties) simulator(batcher, "write a program that can sum two number in python", { "max_new_tokens": 256, diff --git a/engines/python/setup/djl_python/tests/test_input_output.py b/engines/python/setup/djl_python/tests/test_input_output.py index 20eb8fcc9..0bbead0f4 100644 --- a/engines/python/setup/djl_python/tests/test_input_output.py +++ b/engines/python/setup/djl_python/tests/test_input_output.py @@ -42,8 +42,8 @@ def test_concurrent_batch(self): }] properties = [{"eula": "true", "Content-type": "application/json"}] serving_properties = { - "engine": "MPI", - "option.rolling_batch": "lmi-dist", + "engine": "Python", + "option.rolling_batch": "vllm", "option.model_id": "llama-70b" } inputs = test_model.create_concurrent_batch_request( diff --git a/engines/python/setup/djl_python/tests/test_properties_manager.py b/engines/python/setup/djl_python/tests/test_properties_manager.py index 27420ec39..68164fbef 100644 --- a/engines/python/setup/djl_python/tests/test_properties_manager.py +++ b/engines/python/setup/djl_python/tests/test_properties_manager.py @@ -13,7 +13,6 @@ from djl_python.properties_manager.hf_properties import HuggingFaceProperties from djl_python.properties_manager.vllm_rb_properties import VllmRbProperties from djl_python.properties_manager.sd_inf2_properties import StableDiffusionNeuronXProperties -from djl_python.properties_manager.lmi_dist_rb_properties import LmiDistRbProperties from djl_python.tests.utils import parameterized, parameters import torch @@ -305,22 +304,13 @@ def test_tnx_configs_error_case(self, params): @parameters([{ "rolling_batch": "auto", - }, { - "rolling_batch": "lmi-dist", - "is_error_case": True }]) def test_trt_llm_configs(self, params): - is_error_case = params.pop("is_error_case", False) properties = {**model_min_properties, **params} - if is_error_case: - with self.assertRaises(ValueError): - TensorRtLlmProperties(**properties) - else: - trt_configs = TensorRtLlmProperties(**properties) - self.assertEqual(trt_configs.model_id_or_path, - properties['model_id']) - self.assertEqual(trt_configs.rolling_batch.value, - properties['rolling_batch']) + trt_configs = TensorRtLlmProperties(**properties) + self.assertEqual(trt_configs.model_id_or_path, properties['model_id']) + self.assertEqual(trt_configs.rolling_batch.value, + properties['rolling_batch']) def test_hf_configs(self): properties = { @@ -401,15 +391,6 @@ def test_hf_device_map(self, mock_device_count): rolling_batch="disable") self.assertIsNone(hf_configs.kwargs.get("device_map")) - def test_hf_quantize(self): - properties = { - 'model_id': 'model_id', - 'quantize': 'bitsandbytes8', - 'rolling_batch': 'lmi-dist' - } - hf_configs = HuggingFaceProperties(**properties) - self.assertEqual(hf_configs.quantize, "bitsandbytes") - @parameters([{ "model_id": "model_id", "quantize": "bitsandbytes4" @@ -700,93 +681,6 @@ def test_sd_inf2_properties_errors(self, params): with self.assertRaises(ValueError): StableDiffusionNeuronXProperties(**test_properties) - def test_lmi_dist_properties(self): - - def test_with_min_properties(): - lmi_configs = LmiDistRbProperties(**min_properties) - self.assertEqual(lmi_configs.model_id_or_path, - min_properties['model_id']) - self.assertEqual(lmi_configs.load_format, 'auto') - self.assertEqual(lmi_configs.dtype, 'auto') - self.assertEqual(lmi_configs.gpu_memory_utilization, 0.9) - self.assertTrue(lmi_configs.mpi_mode) - self.assertFalse(lmi_configs.enable_lora) - - def test_with_most_properties(): - properties = { - 'trust_remote_code': 'TRUE', - 'tensor_parallel_degree': '2', - 'revision': 'somerevisionstr', - 'max_rolling_batch_size': '64', - 'max_rolling_batch_prefill_tokens': '12500', - 'dtype': 'fp32', - 'enable_lora': "true", - } - - lmi_configs = LmiDistRbProperties(**properties, **min_properties) - self.assertEqual(lmi_configs.engine, min_properties['engine']) - self.assertEqual(lmi_configs.model_id_or_path, - min_properties['model_id']) - self.assertEqual(lmi_configs.tensor_parallel_degree, - int(properties['tensor_parallel_degree'])) - self.assertEqual(lmi_configs.revision, properties['revision']) - self.assertEqual(lmi_configs.max_rolling_batch_size, - int(properties['max_rolling_batch_size'])) - self.assertEqual( - lmi_configs.max_rolling_batch_prefill_tokens, - int(properties['max_rolling_batch_prefill_tokens'])) - self.assertEqual(lmi_configs.dtype, 'fp32') - self.assertTrue(lmi_configs.mpi_mode) - self.assertTrue(lmi_configs.trust_remote_code) - self.assertEqual(lmi_configs.enable_lora, - bool(properties['enable_lora'])) - - def test_quantization_squeezellm(): - properties = {'quantize': 'squeezellm'} - lmi_configs = LmiDistRbProperties(**properties, **min_properties) - self.assertEqual(lmi_configs.quantize, "squeezellm") - - def test_long_lora_scaling_factors(): - properties = {"long_lora_scaling_factors": "3.0"} - lmi_configs = LmiDistRbProperties(**properties, **min_properties) - self.assertEqual(lmi_configs.long_lora_scaling_factors, (3.0, )) - - properties = {"long_lora_scaling_factors": "3"} - lmi_configs = LmiDistRbProperties(**properties, **min_properties) - self.assertEqual(lmi_configs.long_lora_scaling_factors, (3.0, )) - - properties = {"long_lora_scaling_factors": "3.0,4.0"} - lmi_configs = LmiDistRbProperties(**properties, **min_properties) - self.assertEqual(lmi_configs.long_lora_scaling_factors, (3.0, 4.0)) - - properties = {"long_lora_scaling_factors": "3.0, 4.0 "} - lmi_configs = LmiDistRbProperties(**properties, **min_properties) - self.assertEqual(lmi_configs.long_lora_scaling_factors, (3.0, 4.0)) - - properties = {"long_lora_scaling_factors": "(3.0,)"} - lmi_configs = LmiDistRbProperties(**properties, **min_properties) - self.assertEqual(lmi_configs.long_lora_scaling_factors, (3.0, )) - - properties = {"long_lora_scaling_factors": "(3.0,4.0)"} - lmi_configs = LmiDistRbProperties(**properties, **min_properties) - self.assertEqual(lmi_configs.long_lora_scaling_factors, (3.0, 4.0)) - - def test_invalid_long_lora_scaling_factors(): - properties = {'long_lora_scaling_factors': "(a,b)"} - with self.assertRaises(ValueError): - LmiDistRbProperties(**properties, **min_properties) - - min_properties = { - 'engine': 'MPI', - 'mpi_mode': 'true', - 'model_id': 'sample_model_id', - } - test_with_min_properties() - test_with_most_properties() - test_quantization_squeezellm() - test_long_lora_scaling_factors() - test_invalid_long_lora_scaling_factors() - if __name__ == '__main__': unittest.main() diff --git a/engines/python/setup/djl_python/utils.py b/engines/python/setup/djl_python/utils.py index 4b2a4c3f7..b9ea02aea 100644 --- a/engines/python/setup/djl_python/utils.py +++ b/engines/python/setup/djl_python/utils.py @@ -87,7 +87,7 @@ def is_beam_search(parameters: dict) -> bool: def is_multiple_sequences(parameters: dict) -> bool: """ Returns whether the parameters indicate number of output sequences to return is more than 1. - When the user give us n, best_of is automatically applied in vllm and lmi-dist. + When the user give us n, best_of is automatically applied in vllm. :param parameters: parameters dictionary :return: boolean """ diff --git a/engines/python/src/main/java/ai/djl/python/engine/Request.java b/engines/python/src/main/java/ai/djl/python/engine/Request.java index 224f434c0..9d4d99a37 100644 --- a/engines/python/src/main/java/ai/djl/python/engine/Request.java +++ b/engines/python/src/main/java/ai/djl/python/engine/Request.java @@ -72,9 +72,8 @@ Set> getProperties() { } /** - * Seed is required for LMI Dist for sampling for all processes in the MPI to generate the same - * token. NextTokenChooserParameters is constructed during first forward and preserved for all - * forward calls of the request. + * NextTokenChooserParameters is constructed during first forward and preserved for all forward + * calls of the request. * * @return seed, only for first forward */ diff --git a/plugins/secure-mode/src/main/java/ai/djl/serving/plugins/securemode/SecureModeAllowList.java b/plugins/secure-mode/src/main/java/ai/djl/serving/plugins/securemode/SecureModeAllowList.java index 8ee08867a..dfb01ca39 100644 --- a/plugins/secure-mode/src/main/java/ai/djl/serving/plugins/securemode/SecureModeAllowList.java +++ b/plugins/secure-mode/src/main/java/ai/djl/serving/plugins/securemode/SecureModeAllowList.java @@ -111,8 +111,5 @@ interface SecureModeAllowList { "option.pythonExecutable"); public static final Set PYTHON_EXECUTABLE_ALLOWLIST = - Set.of( - "/opt/djl/lmi_dist_venv/bin/python", - "/opt/djl/vllm_venv/bin/python", - "/usr/bin/python3"); + Set.of("/opt/djl/vllm_venv/bin/python", "/usr/bin/python3"); } diff --git a/plugins/secure-mode/src/test/java/ai/djl/serving/plugins/securemode/SecureModePluginTest.java b/plugins/secure-mode/src/test/java/ai/djl/serving/plugins/securemode/SecureModePluginTest.java index 532024e19..f5d241464 100644 --- a/plugins/secure-mode/src/test/java/ai/djl/serving/plugins/securemode/SecureModePluginTest.java +++ b/plugins/secure-mode/src/test/java/ai/djl/serving/plugins/securemode/SecureModePluginTest.java @@ -292,7 +292,7 @@ void testAllowedPythonExecutablePath() throws IOException, ModelException { mockSecurityEnv( "foo", TEST_MODEL_DIR.resolve("serving.properties"), - "option.pythonExecutable=/opt/djl/lmi_dist_venv/bin/python"); + "option.pythonExecutable=/opt/djl/vllm_venv/bin/python"); } private void createFileWithContent(Path file, String content) throws IOException { diff --git a/serving/docker/partition/sm_neo_dispatcher.py b/serving/docker/partition/sm_neo_dispatcher.py index d689af1df..0805f7d1e 100644 --- a/serving/docker/partition/sm_neo_dispatcher.py +++ b/serving/docker/partition/sm_neo_dispatcher.py @@ -21,7 +21,6 @@ VALID_LOAD_FORMATS = ["sagemaker_fast_model_loader"] # Paths to each Python executable -LMI_DIST_VENV_EXEC = "/opt/djl/lmi_dist_venv/bin/python" VLLM_VENV_EXEC = "/opt/djl/vllm_venv/bin/python" SYSTEM_PY_EXEC = "/usr/bin/python3" @@ -123,27 +122,17 @@ def dispatch(self): present serving properties. """ match self.serving_features: - case "vllm,lmi-dist": + case "vllm": if self.is_valid_sharding_config(): - if self.properties.get("option.rolling_batch", - "lmi-dist").lower() == "vllm": - python_exec = VLLM_VENV_EXEC - else: - python_exec = LMI_DIST_VENV_EXEC + python_exec = VLLM_VENV_EXEC print("Sharding Model...") self.run_task(NeoTask.SHARDING, python_exec) else: - if self.properties.get("option.quantize", - "").lower() == "fp8": - python_exec = VLLM_VENV_EXEC - else: - # run awq quantization with lmi-dist venv b/c AutoAWQ - # is incompatible with newer transformers - python_exec = LMI_DIST_VENV_EXEC + python_exec = VLLM_VENV_EXEC self.run_task(NeoTask.QUANTIZATION, python_exec) case "trtllm": self.run_task(NeoTask.TENSORRT_LLM, SYSTEM_PY_EXEC) - case "vllm,lmi-dist,tnx": + case "vllm,tnx": self.run_task(NeoTask.NEURON, SYSTEM_PY_EXEC) case _: raise ValueError( diff --git a/serving/docker/partition/sm_neo_shard.py b/serving/docker/partition/sm_neo_shard.py deleted file mode 100644 index b6e420a08..000000000 --- a/serving/docker/partition/sm_neo_shard.py +++ /dev/null @@ -1,274 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file -# except in compliance with the License. A copy of the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" -# BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. See the License for -# the specific language governing permissions and limitations under the License. -import json -import os -import shutil -import sys -import logging -from importlib.metadata import version -from typing import Final, Optional - -from sm_neo_utils import (OptimizationFatalError, write_error_to_file, - get_neo_env_vars) -from utils import (update_kwargs_with_env_vars, load_properties) - -import torch -from mpi4py import MPI - -from lmi_dist.init_engine import engine_from_args -from lmi_dist.arg_utils import VllmEngineArgs -from lmi_dist.comms import comms -from lmi_dist.vllm_engine import load_model_for_sharding - -CHUNK_MB = 8 - - -class NeoShardingService(): - - def __init__(self): - neo_environ = get_neo_env_vars() - self.INPUT_MODEL_DIRECTORY: Final[str] = neo_environ[ - "SM_NEO_INPUT_MODEL_DIR"] - self.OUTPUT_MODEL_DIRECTORY: Final[str] = neo_environ[ - "SM_NEO_COMPILED_MODEL_DIR"] - self.COMPILATION_ERROR_FILE: Final[str] = neo_environ[ - "SM_NEO_COMPILATION_ERROR_FILE"] - - self.properties: dict = update_kwargs_with_env_vars({}) - self.properties.update(load_properties(self.INPUT_MODEL_DIRECTORY)) - import sagemaker_fast_model_loader_rust as sm_fml - py_version = "{}.{}.{}".format(*sys.version_info[:3]) - - self.pp_degree = int( - self.properties.get("option.pipeline_parallel_degree", 1)) - self.tp_degree = int(self.properties["option.tensor_parallel_degree"]) - self.shard_config = sm_fml.ModelConfig( - pipeline_parallel_size=self.pp_degree, - tensor_parallel_size=self.tp_degree, - framework=sm_fml.ModelFramework.Vllm, - framework_version=version("vllm"), - python_version=py_version, - ) - - def add_shard_configs( - self, - partial_configs: list, - ): - for entry in partial_configs: - if not entry["config"]: - continue - self.shard_config.add_shard( - pipeline_parallel_degree=int(entry["pp"]), - tensor_parallel_degree=int(entry["tp"]), - shard_config=entry["config"], - ) - - def save_configs(self, input_dir: str = "", output_dir: str = "") -> None: - self.shard_config.save(output_dir=output_dir) - logging.info( - f"SageMaker Fast Model Loader config file saved to {output_dir}") - self.copy_non_safetensors_files(input_dir, output_dir) - logging.info(f"Other non-Safetensors files copied to {output_dir}") - - def copy_non_safetensors_files(self, input_dir: str, output_dir: str): - """ - Copy all files that are not Safetensors weights from input dir to output dir - """ - index_json_path = os.path.join(input_dir, - "model.safetensors.index.json") - if os.path.exists(index_json_path): - with open(index_json_path, "r") as f: - index_data = json.load(f) - safetensors_files = list(index_data["weight_map"].values()) - else: - # If the index file doesn't exist, assume there is only a single model.safetensors file - safetensors_files = ["model.safetensors"] - - for item in os.listdir(input_dir): - item_path = os.path.join(input_dir, item) - if item not in safetensors_files and item != "model.safetensors.index.json": - if os.path.isfile(item_path): - shutil.copy2(item_path, os.path.join(output_dir, item)) - elif os.path.isdir(item_path): - shutil.copytree(item_path, os.path.join(output_dir, item)) - - def generate_properties_file(self): - with open( - os.path.join(self.OUTPUT_MODEL_DIRECTORY, - "serving.properties"), "w") as f: - for key, value in self.properties.items(): - f.write(f"{key}={value}\n") - - # By setting pp_rank and tp_rank_interval , only workers in those ranks will load the model - # i.e. in case pp=2, tp=4, the arg of pp_rank=1, tp_interval = [2,3,4] - # only workers with rank 5, 6, 7 load the model - def shard_lmi_dist_model(self, input_dir: str, output_dir: str, - pp_degree: int, tp_degree: int, chunk_mb: int, - target_pp_rank: int, - target_tp_rank_interval) -> None: - # For engine args which can affect GPU memory utilization, use LMI defaults - # unless specified otherwise by the customer - gpu_memory_utilization = float( - self.properties.get("option.gpu_memory_utilization", 0.9)) - enforce_eager: bool = self.properties.get("option.enforce_eager", - "false").lower() == "true" - max_rolling_batch_size = int( - self.properties.get("option.max_rolling_batch_size", 256)) - max_model_len = self.properties.get("option.max_model_len", None) - if max_model_len is not None: - max_model_len = int(max_model_len) - - # LoraConfigs - lora_kwargs = {} - if enable_lora := self.properties.get("option.enable_lora"): - enable_lora_bool = enable_lora.lower() == "true" - - if enable_lora_bool: - max_loras: int = int( - self.properties.get("option.max_loras", "4")) - max_lora_rank: int = int( - self.properties.get("option.max_lora_rank", "16")) - fully_sharded_loras: bool = str( - self.properties.get("option.fully_sharded_loras", - "false")).lower() == "true" - lora_extra_vocab_size: int = int( - self.properties.get("option.lora_extra_vocab_size", "256")) - lora_dtype: str = self.properties.get("option.lora_dtype", - "auto") - max_cpu_loras: Optional[int] = None - if cpu_loras := self.properties.get("option.max_cpu_loras"): - max_cpu_loras = int(cpu_loras) - - lora_kwargs["enable_lora"] = enable_lora_bool - lora_kwargs["fully_sharded_loras"] = fully_sharded_loras - lora_kwargs["max_loras"] = max_loras - lora_kwargs["max_lora_rank"] = max_lora_rank - lora_kwargs["lora_extra_vocab_size"] = lora_extra_vocab_size - lora_kwargs["lora_dtype"] = lora_dtype - lora_kwargs["max_cpu_loras"] = max_cpu_loras - - engine_args = VllmEngineArgs( - model=input_dir, - pipeline_parallel_size=pp_degree, - tensor_parallel_size=tp_degree, - disable_custom_all_reduce=True, - distributed_executor_backend="mp", - gpu_memory_utilization=gpu_memory_utilization, - enforce_eager=enforce_eager, - max_num_seqs=max_rolling_batch_size, - max_model_len=max_model_len, - **lora_kwargs, - ) - - engine_configs = engine_args.create_engine_configs() - engine_worker = load_model_for_sharding(engine_configs, target_pp_rank, - target_tp_rank_interval) - - # Lazy import to avoid MPI not-inited errors - import sagemaker_fast_model_loader_rust as sm_fml - model_dir = os.path.join(output_dir, sm_fml.MODEL_DIR_NAME) - os.makedirs(model_dir, exist_ok=True) - - config_for_current_rank = engine_worker.save_chunked_shard( - output_dir=model_dir, - chunk_mb=chunk_mb, - target_pp_rank=target_pp_rank, - target_tp_rank_interval=target_tp_rank_interval) - - # Gather results from all ranks to driver process - configs = MPI.COMM_WORLD.gather(config_for_current_rank, root=0) - - # Driver process saves configs of current rank to disk - if comms.rank == 0: - self.add_shard_configs(configs) - - del engine_worker - torch.cuda.empty_cache() - MPI.COMM_WORLD.Barrier() - print( - f"Memory after cleaning {torch.cuda.memory_allocated()/(1024**3)} GB" - ) - - def generate_tensor_parallel_intervals(self, num_gpus, tp_degree): - """ - Generate intervals for tensor parallel partitions across available GPUs. - - Args: - num_gpus (int): Number of available GPUs - tp_degree (int): Tensor parallel degree - - Returns: - list: List of lists containing the partition intervals - """ - intervals = [] - start = 0 - - while start < tp_degree: - end = min(start + num_gpus, tp_degree) - interval = list(range(start, end)) - intervals.append(interval) - start = end - - return intervals - - def run_sharding(self): - try: - device_count = torch.cuda.device_count() - # This is to generate shards by batch - # Example 1: TP=4, PP=2 on 4-GPU instance - # batch 1: PP=0, TP=[0,1,2,3] - # batch 2: PP=1, TP=[0,1,2,3] - # Example 2: TP=8, PP=1 on 4-GPU instance - # batch 1: PP=0, TP=[0,1,2,3] - # batch 2: PP=0, TP=[4,5,6,7] - for pp_rank in range(self.pp_degree): - for tp_interval in self.generate_tensor_parallel_intervals( - device_count, self.tp_degree): - self.shard_lmi_dist_model( - input_dir=self.INPUT_MODEL_DIRECTORY, - output_dir=self.OUTPUT_MODEL_DIRECTORY, - pp_degree=self.pp_degree, - tp_degree=self.tp_degree, - chunk_mb=CHUNK_MB, - target_pp_rank=pp_rank, - target_tp_rank_interval=tp_interval) - if comms.rank == 0: - self.save_configs(input_dir=self.INPUT_MODEL_DIRECTORY, - output_dir=self.OUTPUT_MODEL_DIRECTORY) - - except Exception as exc: - raise OptimizationFatalError( - f"Encountered an error during sharding: {exc}") - - -def main(): - logging.basicConfig(stream=sys.stdout, - format="%(message)s", - level=logging.INFO, - force=True) - - try: - neo_sharding_service = NeoShardingService() - neo_sharding_service.run_sharding() - neo_sharding_service.generate_properties_file() - - except Exception as exc: - MPI.COMM_WORLD.Barrier() - write_error_to_file(exc, neo_sharding_service.COMPILATION_ERROR_FILE) - raise exc - finally: - MPI.Finalize() - - -if __name__ == "__main__": - main() diff --git a/serving/docs/lmi/user_guides/lmi_input_output_schema.md b/serving/docs/lmi/user_guides/lmi_input_output_schema.md index 59f32ee87..d5e34f8e0 100644 --- a/serving/docs/lmi/user_guides/lmi_input_output_schema.md +++ b/serving/docs/lmi/user_guides/lmi_input_output_schema.md @@ -294,33 +294,6 @@ To enable sampling in LMI <= 0.31.0, you must specify `do_sample: true` in addit This behavior will change starting LMI 0.32.0 where you will no longer be required to set `do_sample`, it will be inferred from the other sampling parameters. -#### Additional LMI Dist Generation parameters - -``` -LmiDistRollingBatchParameters : { - 'typical_p' : float (default= 1.0), - 'truncate' : integer (default = None), - 'ignore_eos_token' : boolean (default = false), - 'top_k' : integer (default = -1), - 'min_p': float (default = 0.0), - 'presence_penalty': float (default = 0.0), - 'frequency_penalty' : float (default = 0.0), - 'n': integer (default = 1), (set this greater than 1 to get mutliple sequences. only works with non-streaming case) - 'best_of': integer (default = 1) - 'num_beams': integer (default = 1), (set this greater than 1 to enable beam search. only works with non-streaming case) - 'length_penalty' : float (default = 1.0), - 'early_stopping' : boolean (default = false), - 'stop_token_ids': list (default = None), - 'include_stop_str_in_output' : boolean (default = false), - 'ignore_eos_token' : boolean (default = false), - 'logprobs' : int (default = None), - 'prompt_logprobs' : int (default = None), - 'skip_special_tokens': boolean (default = true), - 'spaces_between_special_tokens': boolean (default = true), -} -``` - -Decoding methods supported in LmiDist : Greedy (Default) and Sampling. #### Additional vLLM Generation Parameters diff --git a/tests/integration/benchmark/ir-llm/README.md b/tests/integration/benchmark/ir-llm/README.md index 7c47a47d8..105c89880 100644 --- a/tests/integration/benchmark/ir-llm/README.md +++ b/tests/integration/benchmark/ir-llm/README.md @@ -23,12 +23,10 @@ The config.yml file defines the overall benchmark configuration, including: * benchmark_report_s3_location: The S3 location where the benchmark reports will be stored. * model_test_cases: A list of model test cases to be benchmarked. -An example can be found at: ./config/lmi-dist/config.yml ### benchmark_config_xxx.json The xxx.json files in the configs directory define the IR-LLM configuration for each model's test case. -An example can be found at: ./config/lmi-dist/config_ir_job/benchmark_config_passive_Llama-3-1-405b-fp8.json ## Benchmark Reports After running the benchmark, the reports will be saved in the specified S3 location. The reports will contain detailed metrics and performance data for each benchmarked model test case. \ No newline at end of file diff --git a/tests/integration/benchmark/ir-llm/config/lmi-dist/config.yml b/tests/integration/benchmark/ir-llm/config/lmi-dist/config.yml deleted file mode 100644 index 5a6fcc2ea..000000000 --- a/tests/integration/benchmark/ir-llm/config/lmi-dist/config.yml +++ /dev/null @@ -1,133 +0,0 @@ -region: "us-west-2" - -cloudwatch: - metrics_namespace: "SageMaker_LLM_Benchmark" - -s3: - bucket_name: "djl-benchmark-llm" - folder: "sm-lmi-dist" - -metrics: - timeToFirstToken_p50: - metric_name: "TTFT_P50" - unit: "Milliseconds" - - timeToFirstToken_p99: - metric_name: "TTFT_P99" - unit: "Milliseconds" - - intertokenLatency_p50: - metric_name: "InterTokenLatency_P50" - unit: "Milliseconds" - - intertokenLatency_p99: - metric_name: "InterTokenLatency_P99" - unit: "Milliseconds" - - costPerMillionInputTokens: - metric_name: "CostPerMillionInputTokens" - unit: "Count" - - costPerMillionOutputTokens: - metric_name: "CostPerMillionOutputTokens" - unit: "None" - - tokenizerFailed_Sum: - metric_name: "TokenizerErrorRate" - unit: "Percent" - - numberOfInputTokens_p50: - metric_name: "NumberOfInputTokens_p50" - unit: "None" - - numberOfInputTokens_p99: - metric_name: "NumberOfInputTokens_p99" - unit: "None" - - numberOfOutputTokens_p50: - metric_name: "NumberOfOutputTokens_p50" - unit: "None" - - numberOfOutputTokens_p99: - metric_name: "NumberOfOutputTokens_p99" - unit: "None" - - clientInvocationErrors_Sum: - metric_name: "ClientInvocationErrorRate" - unit: "Percent" - - emptyInferenceResponse_Sum: - metric_name: "EmptyInferenceResponseRate" - unit: "Percent" - -benchmarks: - - model: "Llama-3.1-8b" - endpoints: - - endpoint: "sagemaker" - image: "LMI-dist" - config: "benchmark_config_passive_Llama-3-1-8b.json" - dataset: "s3://djl-benchmark-llm-datasets/openorca/openorca_base_sample_payload_en_500-1000.tar.gz" - action: yes - - model: "Llama-3.1-8b-suzuka" - endpoints: - - endpoint: "sagemaker" - image: "LMI-dist" - config: "benchmark_config_LMI_V12_Llama-3-1-8b-suzuka.json" - dataset: "s3://djl-benchmark-llm-datasets/openorca/openorca_base_payload_en_500-1000.tar.gz" - action: no - - model: "Llama-3.1-8b-instruct" - endpoints: - - endpoint: "sagemaker" - image: "LMI-dist" - config: "benchmark_config_passive_Llama-3-1-8b-instruct.json" - dataset: "s3://djl-benchmark-llm-datasets/openorca/openorca_instruct_sample_payload_en_500-1000.tar.gz" - action: yes - - model: "Llama-3.1-8b-instruct-suzuka" - endpoints: - - endpoint: "sagemaker" - image: "LMI-dist" - config: "benchmark_config_LMI_V12_Llama-3-1-8b-instruct-suzuka.json" - dataset: "s3://djl-benchmark-llm-datasets/openorca/openorca_instruct_payload_en_500-1000.tar.gz" - action: no - - model: "Llama-3.1-70b" - endpoints: - - endpoint: "sagemaker" - image: "LMI-dist" - config: "benchmark_config_passive_Llama-3-1-70b.json" - dataset: "s3://djl-benchmark-llm-datasets/openorca/openorca_base_sample_payload_en_500-1000.tar.gz" - action: yes - - model: "Llama-3.1-70b-suzuka" - endpoints: - - endpoint: "sagemaker" - image: "LMI-dist" - config: "benchmark_config_LMI_V12_Llama-3-1-70b-suzuka.json" - dataset: "s3://djl-benchmark-llm-datasets/openorca/openorca_base_payload_en_500-1000.tar.gz" - action: no - - model: "Llama-3.1-70b-instruct" - endpoints: - - endpoint: "sagemaker" - image: "LMI-dist" - config: "benchmark_config_passive_Llama-3-1-70b-instruct.json" - dataset: "s3://djl-benchmark-llm-datasets/openorca/openorca_instruct_sample_payload_en_500-1000.tar.gz" - action: yes - - model: "Llama-3.1-70b-instruct-suzuka" - endpoints: - - endpoint: "sagemaker" - image: "LMI-dist" - config: "benchmark_config_LMI_V12_Llama-3-1-70b-instruct-suzuka.json" - dataset: "s3://djl-benchmark-llm-datasets/openorca/openorca_instruct_payload_en_500-1000.tar.gz" - action: no - - model: "Llama-3.1-405b-fp8" - endpoints: - - endpoint: "sagemaker" - image: "LMI-dist" - config: "benchmark_config_passive_Llama-3-1-405b-fp8.json" - dataset: "s3://djl-benchmark-llm-datasets/openorca/openorca_base_sample_payload_en_500-1000.tar.gz" - action: yes - - model: "Llama-3.1-405b-instruct-fp8" - endpoints: - - endpoint: "sagemaker" - image: "LMI-dist" - config: "benchmark_config_passive_Llama-3-1-405b-instruct-fp8.json" - dataset: "s3://djl-benchmark-llm-datasets/openorca/openorca_instruct_sample_payload_en_500-1000.tar.gz" - action: yes diff --git a/tests/integration/benchmark/ir-llm/config/lmi-dist/config_ir_job/benchmark_config_passive_Llama-3-1-405b-fp8.json b/tests/integration/benchmark/ir-llm/config/lmi-dist/config_ir_job/benchmark_config_passive_Llama-3-1-405b-fp8.json deleted file mode 100644 index e52cce3a0..000000000 --- a/tests/integration/benchmark/ir-llm/config/lmi-dist/config_ir_job/benchmark_config_passive_Llama-3-1-405b-fp8.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "tokenizer_model_id": "meta-llama/Meta-Llama-3.1-405B-FP8", - "jumpstart_model_id": "meta-textgeneration-llama-3-1-405b-fp8", - "use_jumpstart_prod_artifact": true, - "image_uri": "125045733377.dkr.ecr.us-west-2.amazonaws.com/djl-serving:lmi-nightly", - "image_uri_args": { - "framework": "djl-lmi", - "version": "nightly" - }, - "model_args": { - "env": { - "HF_MODEL_ID": "/opt/ml/model/", - "OPTION_MAX_MODEL_LEN": "8192", - "OPTION_USE_PASSIVE_WORKERS": "true" - }, - "enable_network_isolation": true - }, - "benchmark_configurations": [ - { - "instance_type": "ml.p4d.24xlarge", - "env_params": { - "TENSOR_PARALLEL_DEGREE": [ - "8" - ] - } - }, - { - "instance_type": "ml.p5.48xlarge", - "env_params": { - "TENSOR_PARALLEL_DEGREE": [ - "8" - ] - } - } - ] -} \ No newline at end of file diff --git a/tests/integration/benchmark/ir-llm/config/lmi-dist/config_ir_job/benchmark_config_passive_Llama-3-1-405b-instruct-fp8.json b/tests/integration/benchmark/ir-llm/config/lmi-dist/config_ir_job/benchmark_config_passive_Llama-3-1-405b-instruct-fp8.json deleted file mode 100644 index 880c8d028..000000000 --- a/tests/integration/benchmark/ir-llm/config/lmi-dist/config_ir_job/benchmark_config_passive_Llama-3-1-405b-instruct-fp8.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "tokenizer_model_id": "meta-llama/Meta-Llama-3.1-405B-Instruct-FP8", - "jumpstart_model_id": "meta-textgeneration-llama-3-1-405b-instruct-fp8", - "use_jumpstart_prod_artifact": true, - "image_uri": "125045733377.dkr.ecr.us-west-2.amazonaws.com/djl-serving:lmi-nightly", - "image_uri_args": { - "framework": "djl-lmi", - "version": "nightly" - }, - "model_args": { - "env": { - "HF_MODEL_ID": "/opt/ml/model/", - "OPTION_MAX_MODEL_LEN": "8192", - "OPTION_USE_PASSIVE_WORKERS": "true" - }, - "enable_network_isolation": true - }, - "benchmark_configurations": [ - { - "instance_type": "ml.p5.48xlarge", - "env_params": { - "TENSOR_PARALLEL_DEGREE": [ - "8" - ] - } - } - ] -} \ No newline at end of file diff --git a/tests/integration/benchmark/ir-llm/config/lmi-dist/config_ir_job/benchmark_config_passive_Llama-3-1-70b-instruct.json b/tests/integration/benchmark/ir-llm/config/lmi-dist/config_ir_job/benchmark_config_passive_Llama-3-1-70b-instruct.json deleted file mode 100644 index a1663dc68..000000000 --- a/tests/integration/benchmark/ir-llm/config/lmi-dist/config_ir_job/benchmark_config_passive_Llama-3-1-70b-instruct.json +++ /dev/null @@ -1,52 +0,0 @@ -{ - "tokenizer_model_id": "meta-llama/Meta-Llama-3-70B-Instruct", - "jumpstart_model_id": "meta-textgeneration-llama-3-1-70b-instruct", - "use_jumpstart_prod_artifact": true, - "image_uri": "125045733377.dkr.ecr.us-west-2.amazonaws.com/djl-serving:lmi-nightly", - "image_uri_args": { - "framework": "djl-lmi", - "version": "nightly" - }, - "model_args": { - "env": { - "HF_MODEL_ID": "/opt/ml/model/", - "OPTION_MAX_MODEL_LEN": "8192", - "OPTION_USE_PASSIVE_WORKERS": "true" - }, - "enable_network_isolation": true - }, - "benchmark_configurations": [ - { - "instance_type": "ml.g5.48xlarge", - "env_params": { - "TENSOR_PARALLEL_DEGREE": [ - "8" - ] - } - }, - { - "instance_type": "ml.g6.48xlarge", - "env_params": { - "TENSOR_PARALLEL_DEGREE": [ - "8" - ] - } - }, - { - "instance_type": "ml.p4d.24xlarge", - "env_params": { - "TENSOR_PARALLEL_DEGREE": [ - "8" - ] - } - }, - { - "instance_type": "ml.p5.48xlarge", - "env_params": { - "TENSOR_PARALLEL_DEGREE": [ - "8" - ] - } - } - ] -} \ No newline at end of file diff --git a/tests/integration/benchmark/ir-llm/config/lmi-dist/config_ir_job/benchmark_config_passive_Llama-3-1-70b.json b/tests/integration/benchmark/ir-llm/config/lmi-dist/config_ir_job/benchmark_config_passive_Llama-3-1-70b.json deleted file mode 100644 index 4536d7b52..000000000 --- a/tests/integration/benchmark/ir-llm/config/lmi-dist/config_ir_job/benchmark_config_passive_Llama-3-1-70b.json +++ /dev/null @@ -1,52 +0,0 @@ -{ - "tokenizer_model_id": "meta-llama/Meta-Llama-3-70B", - "jumpstart_model_id": "meta-textgeneration-llama-3-1-70b", - "use_jumpstart_prod_artifact": true, - "image_uri": "125045733377.dkr.ecr.us-west-2.amazonaws.com/djl-serving:lmi-nightly", - "image_uri_args": { - "framework": "djl-lmi", - "version": "nightly" - }, - "model_args": { - "env": { - "HF_MODEL_ID": "/opt/ml/model/", - "OPTION_MAX_MODEL_LEN": "8192", - "OPTION_USE_PASSIVE_WORKERS": "true" - }, - "enable_network_isolation": true - }, - "benchmark_configurations": [ - { - "instance_type": "ml.g5.48xlarge", - "env_params": { - "TENSOR_PARALLEL_DEGREE": [ - "8" - ] - } - }, - { - "instance_type": "ml.g6.48xlarge", - "env_params": { - "TENSOR_PARALLEL_DEGREE": [ - "8" - ] - } - }, - { - "instance_type": "ml.p4d.24xlarge", - "env_params": { - "TENSOR_PARALLEL_DEGREE": [ - "8" - ] - } - }, - { - "instance_type": "ml.p5.48xlarge", - "env_params": { - "TENSOR_PARALLEL_DEGREE": [ - "8" - ] - } - } - ] -} \ No newline at end of file diff --git a/tests/integration/benchmark/ir-llm/config/lmi-dist/config_ir_job/benchmark_config_passive_Llama-3-1-8b-instruct.json b/tests/integration/benchmark/ir-llm/config/lmi-dist/config_ir_job/benchmark_config_passive_Llama-3-1-8b-instruct.json deleted file mode 100644 index 191d37887..000000000 --- a/tests/integration/benchmark/ir-llm/config/lmi-dist/config_ir_job/benchmark_config_passive_Llama-3-1-8b-instruct.json +++ /dev/null @@ -1,55 +0,0 @@ -{ - "tokenizer_model_id": "meta-llama/Meta-Llama-3-8B-Instruct", - "jumpstart_model_id": "meta-textgeneration-llama-3-1-8b-instruct", - "use_jumpstart_prod_artifact": true, - "image_uri": "125045733377.dkr.ecr.us-west-2.amazonaws.com/djl-serving:lmi-nightly", - "image_uri_args": { - "framework": "djl-lmi", - "version": "nightly" - }, - "model_args": { - "env": { - "HF_MODEL_ID": "/opt/ml/model/", - "OPTION_MAX_MODEL_LEN": "8192", - "OPTION_USE_PASSIVE_WORKERS": "true" - }, - "enable_network_isolation": true - }, - "benchmark_configurations": [ - { - "instance_type": "ml.g5.12xlarge", - "env_params": { - "OPTION_GPU_MEMORY_UTILIZATION": [ - "0.85" - ] - } - }, - { - "instance_type": "ml.g6.12xlarge", - "env_params": { - "OPTION_GPU_MEMORY_UTILIZATION": [ - "0.85" - ] - } - }, - { - "instance_type": "ml.p4d.24xlarge", - "env_params": { - "TENSOR_PARALLEL_DEGREE": [ - "1" - ] - } - }, - { - "instance_type": "ml.p5.48xlarge", - "env_params": { - "TENSOR_PARALLEL_DEGREE": [ - "1" - ], - "OPTION_GPU_MEMORY_UTILIZATION": [ - "0.95" - ] - } - } - ] -} \ No newline at end of file diff --git a/tests/integration/benchmark/ir-llm/config/lmi-dist/config_ir_job/benchmark_config_passive_Llama-3-1-8b.json b/tests/integration/benchmark/ir-llm/config/lmi-dist/config_ir_job/benchmark_config_passive_Llama-3-1-8b.json deleted file mode 100644 index 4c6fdb157..000000000 --- a/tests/integration/benchmark/ir-llm/config/lmi-dist/config_ir_job/benchmark_config_passive_Llama-3-1-8b.json +++ /dev/null @@ -1,55 +0,0 @@ -{ - "tokenizer_model_id": "meta-llama/Meta-Llama-3-8B", - "jumpstart_model_id": "meta-textgeneration-llama-3-1-8b", - "use_jumpstart_prod_artifact": true, - "image_uri": "125045733377.dkr.ecr.us-west-2.amazonaws.com/djl-serving:lmi-nightly", - "image_uri_args": { - "framework": "djl-lmi", - "version": "nightly" - }, - "model_args": { - "env": { - "HF_MODEL_ID": "/opt/ml/model/", - "OPTION_MAX_MODEL_LEN": "8192", - "OPTION_USE_PASSIVE_WORKERS": "true" - }, - "enable_network_isolation": true - }, - "benchmark_configurations": [ - { - "instance_type": "ml.g5.12xlarge", - "env_params": { - "OPTION_GPU_MEMORY_UTILIZATION": [ - "0.85" - ] - } - }, - { - "instance_type": "ml.g6.12xlarge", - "env_params": { - "OPTION_GPU_MEMORY_UTILIZATION": [ - "0.85" - ] - } - }, - { - "instance_type": "ml.p4d.24xlarge", - "env_params": { - "TENSOR_PARALLEL_DEGREE": [ - "1" - ] - } - }, - { - "instance_type": "ml.p5.48xlarge", - "env_params": { - "TENSOR_PARALLEL_DEGREE": [ - "1" - ], - "OPTION_GPU_MEMORY_UTILIZATION": [ - "0.95" - ] - } - } - ] -} \ No newline at end of file diff --git a/tests/integration/benchmark/ir-llm/scripts/cw_metrics.py b/tests/integration/benchmark/ir-llm/scripts/cw_metrics.py index 8f574a26c..c4e31a8c9 100644 --- a/tests/integration/benchmark/ir-llm/scripts/cw_metrics.py +++ b/tests/integration/benchmark/ir-llm/scripts/cw_metrics.py @@ -93,7 +93,7 @@ def run_benchmark(config_yml, benchmark_config_dir, benchmark_metric_dir): metrics = config.get("metrics", {}) hf_token = os.getenv("HF_TOKEN", "") s3_bucket = config.get("s3", {}).get("bucket_name", "djl-benchmark") - s3_folder = config.get("s3", {}).get("folder", "lmi-dist") + s3_folder = config.get("s3", {}).get("folder", "vllm") current_date = datetime.now().strftime("%Y-%m-%d") s3_metrics_folder = f"{current_date}/{s3_folder}/metrics/" s3_config_folder = f"{current_date}/{s3_folder}/config/" diff --git a/tests/integration/benchmark/nightly/g5-12xl.txt b/tests/integration/benchmark/nightly/g5-12xl.txt index cd73232d2..a2da13e31 100644 --- a/tests/integration/benchmark/nightly/g5-12xl.txt +++ b/tests/integration/benchmark/nightly/g5-12xl.txt @@ -1,7 +1,7 @@ [test_name] llama2 [vars] -ENGINE={vllm,lmi-dist} +ENGINE={vllm} [container] deepjavalibrary/djl-serving:lmi-nightly [serving_properties] @@ -19,7 +19,7 @@ TOKENIZER=TheBloke/Llama-2-7B-fp16 ./awscurl -c 32 -N 10 \ [test_name] llama3 [vars] -ENGINE={vllm,lmi-dist} +ENGINE={vllm} [container] deepjavalibrary/djl-serving:lmi-nightly [serving_properties] diff --git a/tests/integration/benchmark/nightly/g5-2xl.txt b/tests/integration/benchmark/nightly/g5-2xl.txt index 88ead4b1c..545d553bb 100644 --- a/tests/integration/benchmark/nightly/g5-2xl.txt +++ b/tests/integration/benchmark/nightly/g5-2xl.txt @@ -1,7 +1,7 @@ [test_name] mistral [vars] -ENGINE={vllm,lmi-dist} +ENGINE={vllm} [benchmark_vars] CONCURRENCY={1,2,4,8} [container] diff --git a/tests/integration/benchmark/nightly/g5-48xl.txt b/tests/integration/benchmark/nightly/g5-48xl.txt index e2749edab..0a7a4f980 100644 --- a/tests/integration/benchmark/nightly/g5-48xl.txt +++ b/tests/integration/benchmark/nightly/g5-48xl.txt @@ -1,7 +1,7 @@ [test_name] mixtral-8x7b [vars] -ENGINE={vllm,lmi-dist} +ENGINE={vllm} [container] deepjavalibrary/djl-serving:lmi-nightly [serving_properties] diff --git a/tests/integration/llm/client.py b/tests/integration/llm/client.py index bf6604b16..0d7b0caf2 100644 --- a/tests/integration/llm/client.py +++ b/tests/integration/llm/client.py @@ -220,228 +220,6 @@ def get_model_name(): }, } -lmi_dist_model_spec = { - "gpt-neox-20b": { - "batch_size": [1, 4], - "seq_length": [256], - "tokenizer": "EleutherAI/gpt-neox-20b" - }, - "falcon-7b": { - "batch_size": [1, 4], - "seq_length": [256], - "tokenizer": "tiiuae/falcon-7b" - }, - "falcon-11b": { - "batch_size": [1, 4], - "seq_length": [256], - "tokenizer": "tiiuae/falcon-11B" - }, - "flan-t5-xxl": { - "batch_size": [1, 4], - "seq_length": [256], - "tokenizer": "google/flan-t5-xxl" - }, - "gpt2": { - "batch_size": [1, 4], - "seq_length": [256], - "tokenizer": "gpt2" - }, - "mpt-7b": { - "batch_size": [1, 4], - "seq_length": [256], - "tokenizer": "mosaicml/mpt-7b" - }, - "octocoder": { - "batch_size": [1, 4], - "seq_length": [256], - "tokenizer": "bigcode/octocoder" - }, - "speculative-llama-13b": { - "batch_size": [1, 8], - "seq_length": [256], - "tokenizer": "TheBloke/Llama-2-13B-fp16" - }, - "starcoder2-7b": { - "batch_size": [1, 4], - "seq_length": [256], - "tokenizer": "bigcode/starcoder2-7b" - }, - "gemma-7b": { - "batch_size": [1, 4], - "seq_length": [256] - }, - "gemma-2b": { - "batch_size": [1, 4], - "seq_length": [256], - }, - "llama2-13b-gptq": { - "batch_size": [1, 4], - "seq_length": [256], - "tokenizer": "TheBloke/Llama-2-13B-fp16", - "parameters": { - "decoder_input_details": True - }, - "stream": [False], - }, - "mistral-7b": { - "batch_size": [1, 4], - "seq_length": [256], - "tokenizer": "amazon/MegaBeam-Mistral-7B-300k" - }, - "llama3-8b-chunked-prefill": { - "batch_size": [1, 4], - "seq_length": [256], - "tokenizer": "TheBloke/Llama-2-7B-Chat-fp16" - }, - "falcon-11b-chunked-prefill": { - "batch_size": [1, 4], - "seq_length": [256], - "tokenizer": "tiiuae/falcon-11B" - }, - "llama2-7b-32k": { - "batch_size": [1, 4], - "seq_length": [1024], - "tokenizer": "TheBloke/Llama-2-13B-fp16", - "parameters": { - "decoder_input_details": True - }, - "stream": [False], - }, - "mistral-7b-128k-awq": { - "batch_size": [1, 4], - "seq_length": [256], - "tokenizer": "amazon/MegaBeam-Mistral-7B-300k" - }, - "mistral-7b-marlin": { - "batch_size": [1, 4], - "seq_length": [256], - "tokenizer": "amazon/MegaBeam-Mistral-7B-300k" - }, - "llama-2-13b-flashinfer": { - "batch_size": [1, 4], - "seq_length": [256], - "tokenizer": "TheBloke/Llama-2-13B-fp16", - }, - "llama-7b-unmerged-lora": { - "batch_size": [3], - "seq_length": [16, 32], - "worker": 1, - "adapters": ["english-alpaca", "portugese-alpaca", "english-alpaca"], - "tokenizer": "TheBloke/Llama-2-7B-fp16" - }, - "llama-7b-unmerged-lora-overflow": { - "batch_size": [4], - "seq_length": [16, 32], - "worker": 1, - "adapters": [f"english-alpaca-{i}" for i in range(20)], - "tokenizer": "TheBloke/Llama-2-7B-fp16" - }, - "llama2-13b-awq-unmerged-lora": { - "batch_size": [4], - "seq_length": [16, 32], - "worker": 1, - "adapters": ["french", "spanish"], - "tokenizer": "TheBloke/Llama-2-13B-fp16" - }, - "mistral-7b-unmerged-lora": { - "batch_size": [4], - "seq_length": [16, 32], - "worker": 1, - "adapters": ["spanish", "german"], - "tokenizer": "unsloth/mistral-7b-instruct-v0.2" - }, - "mistral-7b-awq-unmerged-lora": { - "batch_size": [4], - "seq_length": [16, 32], - "worker": 1, - "adapters": ["spanish", "german"], - "tokenizer": "TheBloke/Mistral-7B-Instruct-v0.2-AWQ" - }, - "mistral-7b-gptq-unmerged-lora": { - "batch_size": [4], - "seq_length": [16, 32], - "worker": 1, - "adapters": ["spanish", "german"], - "tokenizer": "TheBloke/Mistral-7B-Instruct-v0.2-GPTQ" - }, - "llama3-8b-unmerged-lora": { - "batch_size": [4], - "seq_length": [16, 32], - "worker": 1, - "adapters": ["french", "spanish"], - "tokenizer": "unsloth/llama-3-8b-Instruct" - }, - "gemma-7b-unmerged-lora": { - "batch_size": [4], - "seq_length": [16, 32], - "worker": 1, - "adapters": ["alpaca", "dante"], - "tokenizer": "unsloth/gemma-7b" - }, - "phi2-unmerged-lora": { - "batch_size": [4], - "seq_length": [16, 32], - "worker": 1, - "adapters": ["sql", "bunny"], - "tokenizer": "microsoft/phi-2" - }, - "llama-2-tiny": { - "batch_size": [1, 4], - "seq_length": [256] - }, - "llama3-8b": { - "batch_size": [1, 4], - "seq_length": [256], - "tokenizer": "TheBloke/Llama-3-8B-fp16" - }, - "llama-3.1-8b": { - "batch_size": [1, 4], - "seq_length": [256], - "tokenizer": "NousResearch/Hermes-3-Llama-3.1-8B" - }, - "llama32-3b-multi-worker-tp1-pp1": { - "batch_size": [1, 4], - "seq_length": [256], - }, - "llama32-3b-multi-worker-tp2-pp1": { - "batch_size": [1, 4], - "seq_length": [256], - }, - "llama32-3b-multi-worker-tp1-pp2": { - "batch_size": [1, 4], - "seq_length": [256], - }, - "llama31-8b-pp-only": { - "batch_size": [1, 4], - "seq_length": [256], - }, - "llama31-8b-tp2-pp2": { - "batch_size": [1, 4], - "seq_length": [256], - }, - "llama31-8b-tp2-pp2-spec-dec": { - "batch_size": [1, 4], - "seq_length": [256], - }, - "flan-t5-xl": { - "batch_size": [1, 4], - "seq_length": [256], - }, - "tinyllama-input-len-exceeded": { - "batch_size": [1], - "seq_length": [25], - "tokenizer": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - }, -} - -lmi_dist_chat_model_spec = { - "llama2-7b-chat": { - "batch_size": [1, 4], - "seq_length": [256], - "tokenizer": "TheBloke/Llama-2-7B-Chat-fp16" - } -} - vllm_model_spec = { "gpt-neox-20b": { "batch_size": [1, 4], @@ -655,29 +433,6 @@ def get_model_name(): }, } -lmi_dist_aiccl_model_spec = { - "llama-2-70b-aiccl": { - "batch_size": [1, 8], - "seq_length": [256], - "tokenizer": "TheBloke/Llama-2-13B-fp16" - }, - "codellama-34b-aiccl": { - "batch_size": [1, 8], - "seq_length": [256], - "tokenizer": "codellama/CodeLlama-34b-hf" - }, - "falcon-40b-aiccl": { - "batch_size": [1, 8], - "seq_length": [256], - "tokenizer": "tiiuae/falcon-40b" - }, - "mixtral-8x7b-aiccl": { - "batch_size": [1, 8], - "seq_length": [256], - "tokenizer": "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO" - }, -} - trtllm_model_spec = { "llama2-13b": { "batch_size": [1, 4], @@ -856,17 +611,6 @@ def get_model_name(): "return_full_text": True } }, - "lmi-dist-codestral-22b": { - "batch_size": [41], - "seq_length": [512], - "num_run": 4, - "tokenizer": "bullerwins/Codestral-22B-v0.1-hf", - "dataset": "humaneval", - "score": 0.5, - "parameters": { - "return_full_text": True - } - }, "neuronx-codestral-22b": { "batch_size": [41], "seq_length": [512], @@ -883,14 +627,6 @@ def get_model_name(): "dataset": "mmlu", "score": 0.6 }, - "lmi-dist-llama3-1-8b": { - "batch_size": [213], - "seq_length": [1], - "num_run": 66, - "tokenizer": "TheBloke/Llama-2-7B-fp16", - "dataset": "mmlu", - "score": 0.6 - }, "neuronx-llama3-2-1b": { "batch_size": [32], "seq_length": [1], @@ -2348,10 +2084,7 @@ def run(raw_args): elif args.handler == "transformers_neuronx_neo_rolling_batch": test_handler_rolling_batch(args.model, transformers_neuronx_neo_model_spec) - elif args.handler == "lmi_dist": - test_handler_rolling_batch(args.model, lmi_dist_model_spec) - elif args.handler == "lmi_dist_adapters": - test_handler_adapters(args.model, lmi_dist_model_spec) + elif args.handler == "vllm": test_handler_rolling_batch(args.model, vllm_model_spec) elif args.handler == "custom": @@ -2362,8 +2095,7 @@ def run(raw_args): test_handler_adapters(args.model, vllm_model_spec) elif args.handler == "vllm_async_adapters": test_handler_adapters(args.model, vllm_model_spec) - elif args.handler == "lmi_dist_chat": - test_handler_rolling_batch_chat(args.model, lmi_dist_chat_model_spec) + elif args.handler == "vllm_chat": test_handler_rolling_batch_chat(args.model, vllm_chat_model_spec) elif args.handler == "vllm_tool": @@ -2374,8 +2106,7 @@ def run(raw_args): test_handler_performance(args.model, handler_performance_model_spec) elif args.handler == "performance": test_performance() - elif args.handler == "lmi_dist_aiccl": - test_handler_rolling_batch(args.model, lmi_dist_aiccl_model_spec) + elif args.handler == "trtllm": test_handler_rolling_batch(args.model, trtllm_model_spec) elif args.handler == "trtllm_chat": diff --git a/tests/integration/llm/prepare.py b/tests/integration/llm/prepare.py index eddbdae38..9904822fd 100644 --- a/tests/integration/llm/prepare.py +++ b/tests/integration/llm/prepare.py @@ -54,27 +54,7 @@ } } -performance_test_list = { - "open-llama-13b-fp16-lmi-dist": { - "option.task": "text-generation", - "option.dtype": "fp16", - "engine": "MPI", - "option.model_id": "s3://djl-llm/open-llama-13b/", - "option.rolling_batch": "lmi-dist", - }, - "bloom-7b1-fp16-lmi-dist": { - "engine": "MPI", - "option.task": "text-generation", - "option.rolling_batch": "lmi-dist", - }, - "gpt-neox-20b-fp16-lmi-dist": { - "option.task": "text-generation", - "option.dtype": "fp16", - "engine": "MPI", - "option.model_id": "s3://djl-llm/gpt-neox-20b/", - "option.rolling_batch": "lmi-dist", - } -} +performance_test_list = {} transformers_neuronx_handler_list = { "gpt2": { @@ -320,436 +300,6 @@ } } -lmi_dist_model_list = { - "gpt-neox-20b": { - "option.model_id": "s3://djl-llm/gpt-neox-20b", - "option.task": "text-generation", - "option.tensor_parallel_degree": 4, - "option.max_rolling_batch_size": 4 - }, - "falcon-7b": { - "option.model_id": "tiiuae/falcon-7b", - "option.task": "text-generation", - "option.tensor_parallel_degree": 1, - "option.max_rolling_batch_size": 4, - }, - "falcon-11b": { - "option.model_id": "s3://djl-llm/falcon-11B/", - "option.task": "text-generation", - "option.tensor_parallel_degree": 2, - "option.max_rolling_batch_size": 4, - }, - "flan-t5-xxl": { - "option.model_id": "google/flan-t5-xxl", - "option.task": "text-generation", - "option.tensor_parallel_degree": 4, - "option.max_rolling_batch_size": 4 - }, - "gpt2": { - "option.model_id": "gpt2", - "option.task": "text-generation", - "option.tensor_parallel_degree": 1, - "option.max_rolling_batch_size": 2 - }, - "mpt-7b": { - "option.model_id": "mosaicml/mpt-7b", - "option.task": "text-generation", - "option.trust_remote_code": True, - "option.tensor_parallel_degree": 1, - "option.max_rolling_batch_size": 4, - "load_on_devices": 0 - }, - "octocoder": { - "option.model_id": "s3://djl-llm/octocoder", - "option.task": "text-generation", - "option.trust_remote_code": True, - "option.tensor_parallel_degree": 4, - "option.max_rolling_batch_size": 4 - }, - "speculative-llama-13b": { - "option.model_id": "s3://djl-llm/llama-2-13b-hf/", - "option.speculative_draft_model": "s3://djl-llm/tinyllama-1.1b-chat/", - "option.gpu_memory_utilization": "0.8", - "option.tensor_parallel_degree": "max", - }, - "starcoder2-7b": { - "option.model_id": "s3://djl-llm/bigcode-starcoder2", - "option.task": "text-generation", - "option.trust_remote_code": True, - "option.tensor_parallel_degree": 1, - "option.max_rolling_batch_size": 4, - }, - "gemma-7b": { - "option.model_id": "s3://djl-llm/gemma-7b", - "option.task": "text-generation", - "option.trust_remote_code": True, - "option.tensor_parallel_degree": 1, - "option.max_rolling_batch_size": 4, - "option.max_model_len": 2656, - }, - "gemma-2b": { - "option.model_id": "s3://djl-llm/gemma-2b", - "option.task": "text-generation", - "option.trust_remote_code": True, - "option.tensor_parallel_degree": 1, - "option.max_rolling_batch_size": 256, - }, - "llama2-13b-gptq": { - "option.model_id": "s3://djl-llm/TheBloke-Llama-2-13b-Chat-GPTQ/", - "option.task": "text-generation", - "option.tensor_parallel_degree": 4, - "option.max_rolling_batch_size": 4, - "option.quantize": "gptq" - }, - "mistral-7b": { - "option.model_id": "s3://djl-llm/mistral-7b", - "option.task": "text-generation", - "option.tensor_parallel_degree": 4, - "option.max_rolling_batch_size": 4 - }, - # TODO: Adding max_model_len due to changes mem profiling - # for RoPE scaling models in vLLM - "llama2-7b-32k": { - "option.model_id": "togethercomputer/LLaMA-2-7B-32K", - "option.task": "text-generation", - "option.tensor_parallel_degree": 2, - "option.max_rolling_batch_size": 4, - "option.max_model_len": 51888, - }, - "mistral-7b-128k-awq": { - "option.model_id": "TheBloke/Yarn-Mistral-7B-128k-AWQ", - "option.task": "text-generation", - "option.tensor_parallel_degree": 2, - "option.max_rolling_batch_size": 4, - "option.max_model_len": 32768, - "option.quantize": "awq" - }, - "mistral-7b-marlin": { - "option.model_id": "neuralmagic/OpenHermes-2.5-Mistral-7B-marlin", - "option.task": "text-generation", - "option.tensor_parallel_degree": 4, - "option.max_rolling_batch_size": 4, - "option.quantize": "marlin" - }, - "llama-2-13b-flashinfer": { - "option.model_id": "s3://djl-llm/llama-2-13b-hf/", - "option.task": "text-generation", - "option.tensor_parallel_degree": 4, - "option.max_rolling_batch_size": 4, - }, - "llama3-8b": { - "option.model_id": "s3://djl-llm/llama-3-8b-hf/", - "option.task": "text-generation", - "option.tensor_parallel_degree": 4, - }, - "llama3-8b-chunked-prefill": { - "option.model_id": "s3://djl-llm/llama-3-8b-instruct-hf/", - "option.task": "text-generation", - "option.tensor_parallel_degree": 4, - "option.enable_chunked_prefill": "true", - }, - "falcon-11b-chunked-prefill": { - "option.model_id": "s3://djl-llm/falcon-11B/", - "option.task": "text-generation", - "option.tensor_parallel_degree": 4, - "option.enable_chunked_prefill": "true", - }, - "llama-7b-unmerged-lora": { - "option.model_id": "s3://djl-llm/huggyllama-llama-7b", - "option.tensor_parallel_degree": "max", - "option.enable_lora": "true", - "option.max_loras": 2, - "option.max_lora_rank": 16, - "option.long_lora_scaling_factors": "4.0", - "option.adapters": "adapters", - "adapter_ids": ["tloen/alpaca-lora-7b", "22h/cabrita-lora-v0-1"], - "adapter_names": ["english-alpaca", "portugese-alpaca"], - "option.gpu_memory_utilization": "0.8", - }, - "llama-7b-unmerged-lora-overflow": { - "option.model_id": "s3://djl-llm/huggyllama-llama-7b", - "option.tensor_parallel_degree": "max", - "option.enable_lora": "true", - "option.max_loras": 6, - "option.max_cpu_loras": 8, - "option.adapters": "adapters", - "adapter_ids": ["tloen/alpaca-lora-7b"] * 20, - "adapter_names": [f"english-alpaca-{i}" for i in range(20)], - "option.gpu_memory_utilization": "0.8", - }, - "llama2-7b-chat": { - "option.model_id": "s3://djl-llm/meta-llama-Llama-2-7b-chat-hf/", - "option.task": "text-generation", - "option.tensor_parallel_degree": 4, - "option.max_rolling_batch_size": 4 - }, - "llama2-13b-awq-unmerged-lora": { - "option.model_id": - "s3://djl-llm/TheBloke-Llama-2-13b-Chat-AWQ/", - "option.tensor_parallel_degree": - "max", - "option.quantize": - "awq", - "option.enable_lora": - "true", - "option.max_loras": - 2, - "option.max_lora_rank": - 64, - "option.long_lora_scaling_factors": - "4.0", - "option.adapters": - "adapters", - "adapter_ids": [ - "UnderstandLing/llama-2-13b-chat-fr", - "UnderstandLing/llama-2-13b-chat-es" - ], - "adapter_names": ["french", "spanish"], - "option.gpu_memory_utilization": - "0.8", - }, - "mistral-7b-unmerged-lora": { - "option.model_id": - "s3://djl-llm/mistral-7b-instruct-v02/", - "option.tensor_parallel_degree": - "max", - "option.enable_lora": - "true", - "option.max_loras": - 2, - "option.max_lora_rank": - 64, - "option.long_lora_scaling_factors": - "4.0", - "option.adapters": - "adapters", - "adapter_ids": [ - "UnderstandLing/Mistral-7B-Instruct-v0.2-es", - "UnderstandLing/Mistral-7B-Instruct-v0.2-de" - ], - "adapter_names": ["spanish", "german"], - "option.gpu_memory_utilization": - "0.8", - }, - "mistral-7b-awq-unmerged-lora": { - "option.model_id": - "s3://djl-llm/mistral-7b-instruct-v02-awq/", - "option.tensor_parallel_degree": - "max", - "option.quantize": - "awq", - "option.enable_lora": - "true", - "option.max_loras": - 2, - "option.max_lora_rank": - 64, - "option.long_lora_scaling_factors": - "4.0", - "option.lora_dtype": - "float16", - "option.adapters": - "adapters", - "adapter_ids": [ - "UnderstandLing/Mistral-7B-Instruct-v0.2-es", - "UnderstandLing/Mistral-7B-Instruct-v0.2-de" - ], - "adapter_names": ["spanish", "german"], - "option.gpu_memory_utilization": - "0.8", - }, - "mistral-7b-gptq-unmerged-lora": { - "option.model_id": - "s3://djl-llm/mistral-7b-instruct-v02-gptq/", - "option.tensor_parallel_degree": - "max", - "option.quantize": - "gptq", - "option.dtype": - "fp16", - "option.enable_lora": - "true", - "option.max_loras": - 2, - "option.max_lora_rank": - 64, - "option.long_lora_scaling_factors": - "4.0", - "option.lora_dtype": - "float16", - "option.adapters": - "adapters", - "adapter_ids": [ - "UnderstandLing/Mistral-7B-Instruct-v0.2-es", - "UnderstandLing/Mistral-7B-Instruct-v0.2-de" - ], - "adapter_names": ["spanish", "german"], - "option.gpu_memory_utilization": - "0.8", - }, - "llama3-8b-unmerged-lora": { - "option.model_id": - "s3://djl-llm/llama-3-8b-instruct-hf/", - "option.tensor_parallel_degree": - "max", - "option.enable_lora": - "true", - "option.max_loras": - 2, - "option.max_lora_rank": - 64, - "option.long_lora_scaling_factors": - "4.0", - "option.adapters": - "adapters", - "adapter_ids": [ - "UnderstandLing/Llama-3-8B-Instruct-fr", - "UnderstandLing/Llama-3-8B-Instruct-es", - ], - "adapter_names": ["french", "spanish"], - "option.gpu_memory_utilization": - "0.8", - }, - "gemma-7b-unmerged-lora": { - "option.model_id": - "s3://djl-llm/gemma-7b/", - "option.tensor_parallel_degree": - "max", - "option.enable_lora": - "true", - "option.max_loras": - 1, - "option.max_lora_rank": - 64, - "option.long_lora_scaling_factors": - "4.0", - "option.adapters": - "adapters", - "adapter_ids": [ - "Chuanming/Alpaca-Gemma-7b-lora", - "girtcius/gemma-7b-dante-lora", - ], - "adapter_names": ["alpaca", "dante"], - "option.gpu_memory_utilization": - "0.8", - }, - "phi2-unmerged-lora": { - "option.model_id": - "s3://djl-llm/phi-2/", - "option.tensor_parallel_degree": - "max", - "option.enable_lora": - "true", - "option.max_loras": - 1, - "option.max_lora_rank": - 128, - "option.long_lora_scaling_factors": - "4.0", - "option.adapters": - "adapters", - "adapter_ids": [ - "isotr0py/phi-2-test-sql-lora", - "BAAI/bunny-phi-2-siglip-lora", - ], - "adapter_names": ["sql", "bunny"], - "option.gpu_memory_utilization": - "0.8", - }, - "llama-2-tiny": { - "option.model_id": "s3://djl-llm/llama-2-tiny/", - "option.quantize": "awq", - "option.tensor_parallel_degree": 4, - "option.device_map": "auto" - }, - "llama-3.1-8b": { - "option.model_id": "s3://djl-llm/llama-3.1-8b-hf/", - "option.task": "text-generation", - "option.tensor_parallel_degree": 4, - "option.max_rolling_batch_size": 4 - }, - "llava_v1.6-mistral": { - "option.model_id": "s3://djl-llm/llava-v1.6-mistral-7b-hf/", - "option.limit_mm_per_prompt": '{"image": 4}', - }, - "paligemma-3b-mix-448": { - "option.model_id": "s3://djl-llm/paligemma-3b-mix-448/", - "option.tensor_parallel_degree": 1, - }, - "phi-3-vision-128k-instruct": { - "option.model_id": "s3://djl-llm/phi-3-vision-128k-instruct/", - "option.limit_mm_per_prompt": '{"image": 4}', - "option.trust_remote_code": True, - "option.max_model_len": 8192, - }, - "pixtral-12b": { - "option.model_id": "s3://djl-llm/pixtral-12b-2409/", - "option.max_model_len": 8192, - "option.max_rolling_batch_size": 16, - "option.tokenizer_mode": "mistral", - "option.limit_mm_per_prompt": '{"image": 4}', - }, - "llama32-11b-multimodal": { - "option.model_id": "s3://djl-llm/llama-3-2-11b-vision-instruct/", - "option.max_model_len": 8192, - "option.max_rolling_batch_size": 16, - "option.enforce_eager": True, - }, - "llama32-3b-multi-worker-tp1-pp1": { - "option.model_id": "s3://djl-llm/llama-3-2-3b-instruct/", - "option.tensor_parallel_degree": 1, - "option.pipeline_parallel_degree": 1, - "option.max_model_len": 8192, - "option.max_rolling_batch_size": 16, - }, - "llama32-3b-multi-worker-tp2-pp1": { - "option.model_id": "s3://djl-llm/llama-3-2-3b-instruct/", - "option.tensor_parallel_degree": 2, - "option.pipeline_parallel_degree": 1, - "option.max_model_len": 8192, - "option.max_rolling_batch_size": 16, - }, - "llama32-3b-multi-worker-tp1-pp2": { - "option.model_id": "s3://djl-llm/llama-3-2-3b-instruct/", - "option.tensor_parallel_degree": 1, - "option.pipeline_parallel_degree": 2, - "option.max_model_len": 8192, - "option.max_rolling_batch_size": 16, - }, - "llama31-8b-pp-only": { - "option.model_id": "s3://djl-llm/llama-3.1-8b-instruct-hf/", - "option.tensor_parallel_degree": 1, - "option.pipeline_parallel_degree": 4, - "option.max_model_len": 8192, - "option.max_rolling_batch_size": 16, - }, - "llama31-8b-tp2-pp2": { - "option.model_id": "s3://djl-llm/llama-3.1-8b-instruct-hf/", - "option.tensor_parallel_degree": 2, - "option.pipeline_parallel_degree": 2, - "option.max_model_len": 8192, - "option.max_rolling_batch_size": 16, - }, - "llama31-8b-tp2-pp2-spec-dec": { - "option.model_id": "s3://djl-llm/llama-3.1-8b-instruct-hf/", - "option.tensor_parallel_degree": 2, - "option.pipeline_parallel_degree": 2, - "option.max_model_len": 8192, - "option.max_rolling_batch_size": 16, - "option.speculative_draft_model": - "s3://djl-llm/llama-3-2-1b-instruct/", - }, - "flan-t5-xl": { - "option.model_id": "s3://djl-llm/flan-t5-xl/", - }, - "tinyllama-input-len-exceeded": { - "option.model_id": "s3://djl-llm/tinyllama-1.1b-chat/", - "option.max_model_len": "50", - "option.max_rolling_batch_size": "1", - "option.enforce_eager": True, - }, -} - vllm_model_list = { "llama2-13b": { "option.model_id": "OpenAssistant/llama2-13b-orca-8k-3319", @@ -1151,21 +701,6 @@ } } -lmi_dist_aiccl_model_list = { - "llama-2-70b-aiccl": { - "option.model_id": "s3://djl-llm/llama-2-70b-hf/", - }, - "codellama-34b-aiccl": { - "option.model_id": "s3://djl-llm/CodeLlama-34b-Instruct-hf/", - }, - "falcon-40b-aiccl": { - "option.model_id": "s3://djl-llm/falcon-40b/", - }, - "mixtral-8x7b-aiccl": { - "option.model_id": "s3://djl-llm/mixtral-8x7b/", - } -} - trtllm_handler_list = { "llama2-13b": { "option.model_id": "s3://djl-llm/llama-2-13b-hf/", @@ -1269,14 +804,6 @@ "option.tensor_parallel_degree": 4, "option.max_rolling_batch_size": 41 }, - "lmi-dist-codestral-22b": { - "engine": "MPI", - "option.task": "text-generation", - "option.model_id": "bullerwins/Codestral-22B-v0.1-hf", - "option.rolling_batch": "lmi-dist", - "option.tensor_parallel_degree": 4, - "option.max_rolling_batch_size": 41 - }, "neuronx-codestral-22b": { "engine": "Python", "option.entryPoint": "djl_python.transformers_neuronx", @@ -1296,14 +823,6 @@ "option.tensor_parallel_degree": 4, "option.max_rolling_batch_size": 213 }, - "lmi-dist-llama3-1-8b": { - "engine": "MPI", - "option.task": "text-generation", - "option.model_id": "s3://djl-llm/llama-3.1-8b-hf/", - "option.rolling_batch": "lmi-dist", - "option.tensor_parallel_degree": 4, - "option.max_rolling_batch_size": 213 - }, "neuronx-llama3-2-1b": { "engine": "Python", "option.entryPoint": "djl_python.transformers_neuronx", @@ -1677,23 +1196,6 @@ def build_transformers_neuronx_handler_model(model): write_model_artifacts(options) -def build_lmi_dist_model(model): - if model not in lmi_dist_model_list.keys(): - raise ValueError( - f"{model} is not one of the supporting handler {list(lmi_dist_model_list.keys())}" - ) - options = lmi_dist_model_list[model] - options["engine"] = "MPI" - options["option.rolling_batch"] = "lmi-dist" - - adapter_ids = options.pop("adapter_ids", []) - adapter_names = options.pop("adapter_names", []) - - write_model_artifacts(options, - adapter_ids=adapter_ids, - adapter_names=adapter_names) - - def build_vllm_async_model(model): if model not in vllm_model_list.keys(): raise ValueError( @@ -1801,20 +1303,6 @@ def build_vllm_neo_model(model): create_neo_input_model(options) -def build_lmi_dist_aiccl_model(model): - if model not in lmi_dist_aiccl_model_list.keys(): - raise ValueError( - f"{model} is not one of the supporting handler {list(lmi_dist_aiccl_model_list.keys())}" - ) - options = lmi_dist_aiccl_model_list[model] - options["engine"] = "MPI" - options["option.task"] = "text-generation" - options["option.tensor_parallel_degree"] = 8 - options["option.rolling_batch"] = "lmi-dist" - options["option.max_rolling_batch_size"] = 16 - write_model_artifacts(options) - - def build_trtllm_handler_model(model): if model not in trtllm_handler_list: raise ValueError( @@ -1896,8 +1384,6 @@ def build_stateful_model(model): 'transformers_neuronx': build_transformers_neuronx_handler_model, 'performance': build_performance_model, 'handler_performance': build_handler_performance_model, - 'lmi_dist': build_lmi_dist_model, - 'lmi_dist_aiccl': build_lmi_dist_aiccl_model, 'vllm': build_vllm_model, 'vllm_neo': build_vllm_neo_model, 'trtllm': build_trtllm_handler_model, diff --git a/tests/integration/profiles/bloom_7b1.json b/tests/integration/profiles/bloom_7b1.json index b3aae9493..2874106cd 100644 --- a/tests/integration/profiles/bloom_7b1.json +++ b/tests/integration/profiles/bloom_7b1.json @@ -1,6 +1,6 @@ { "test_series":"performance", - "model": ["bloom-7b1-fp16-deepspeed","bloom-7b1-bf16-deepspeed", "bloom-7b1-fp16-lmi-dist"], + "model": ["bloom-7b1-fp16-deepspeed","bloom-7b1-bf16-deepspeed"], "tensor_parallel": 4, "batch_size": 1, "in_tokens": [256, 512], diff --git a/tests/integration/profiles/gpt_neox_20b.json b/tests/integration/profiles/gpt_neox_20b.json index 48e8a19fb..73cfe5490 100644 --- a/tests/integration/profiles/gpt_neox_20b.json +++ b/tests/integration/profiles/gpt_neox_20b.json @@ -1,6 +1,6 @@ { "test_series":"performance", - "model": ["gpt-neox-20b-fp16-deepspeed", "gpt-neox-20b-fp16-lmi-dist", "gpt-neox-20b-bf16-deepspeed", "gpt-neox-20b-smoothquant"], + "model": ["gpt-neox-20b-fp16-deepspeed", "gpt-neox-20b-bf16-deepspeed", "gpt-neox-20b-smoothquant"], "tensor_parallel": 4, "batch_size": 1, "in_tokens": [256, 512], diff --git a/tests/integration/profiles/llama_13b.json b/tests/integration/profiles/llama_13b.json index 41090a6ca..c5e57b41c 100644 --- a/tests/integration/profiles/llama_13b.json +++ b/tests/integration/profiles/llama_13b.json @@ -1,6 +1,6 @@ { "test_series":"performance", - "model": ["open-llama-13b-fp16-deepspeed", "open-llama-13b-fp16-lmi-dist", "open-llama-13b-bf16-deepspeed", "open-llama-13b-smoothquant"], + "model": ["open-llama-13b-fp16-deepspeed", "open-llama-13b-bf16-deepspeed", "open-llama-13b-smoothquant"], "tensor_parallel": 4, "batch_size": 1, "in_tokens": [256, 512], diff --git a/tests/integration/profiles/opt_30b.json b/tests/integration/profiles/opt_30b.json index 7f9cdd12a..b36f0266f 100644 --- a/tests/integration/profiles/opt_30b.json +++ b/tests/integration/profiles/opt_30b.json @@ -1,6 +1,6 @@ { "test_series":"performance", - "model": ["opt-30b-fp16-deepspeed", "opt-30b-bf16-deepspeed", "opt-30b-lmi-dist"], + "model": ["opt-30b-fp16-deepspeed", "opt-30b-bf16-deepspeed"], "tensor_parallel": 4, "batch_size": 1, "in_tokens": [256, 512], diff --git a/tests/integration/pytest.ini b/tests/integration/pytest.ini index 1d20edb8e..c3958a271 100644 --- a/tests/integration/pytest.ini +++ b/tests/integration/pytest.ini @@ -9,7 +9,7 @@ markers = cpu: Tests cpu hf: Tests the hf accelerate backend - lmi_dist: Tests the lmi backend + vllm: Tests the vllm backend trtllm: Tests the trtllm backend diff --git a/tests/integration/tests.py b/tests/integration/tests.py index 453b10810..852b38236 100644 --- a/tests/integration/tests.py +++ b/tests/integration/tests.py @@ -360,189 +360,6 @@ def test_trtllm_performance(self): client.run("handler_performance trtllm".split()) -@pytest.mark.lmi_dist -@pytest.mark.gpu_4 -class TestLmiDist1: - - def test_gpt_neox_20b(self): - with Runner('lmi', 'gpt-neox-20b') as r: - prepare.build_lmi_dist_model("gpt-neox-20b") - r.launch() - client.run("lmi_dist gpt-neox-20b".split()) - - def test_falcon_7b(self): - with Runner('lmi', 'falcon-7b') as r: - prepare.build_lmi_dist_model("falcon-7b") - r.launch() - client.run("lmi_dist falcon-7b".split()) - - def test_falcon2_11b(self): - with Runner('lmi', 'falcon-11b') as r: - prepare.build_lmi_dist_model("falcon-11b") - r.launch() - client.run("lmi_dist falcon-11b".split()) - - def test_gpt2(self): - with Runner('lmi', 'gpt2') as r: - prepare.build_lmi_dist_model("gpt2") - envs = [ - "OPTION_MAX_ROLLING_BATCH_SIZE=2", - "OPTION_OUTPUT_FORMATTER=jsonlines", - "TENSOR_PARALLEL_DEGREE=1", "OPTION_TASK=text-generation", - "OPTION_ROLLING_BATCH=lmi-dist" - ] - r.launch("\n".join(envs)) - client.run("lmi_dist gpt2".split()) - - def test_mpt_7b(self): - with Runner('lmi', 'mpt-7b') as r: - prepare.build_lmi_dist_model("mpt-7b") - r.launch() - client.run("lmi_dist mpt-7b".split()) - - def test_mistral_7b_marlin(self): - with Runner('lmi', 'mistral-7b-marlin') as r: - prepare.build_lmi_dist_model("mistral-7b-marlin") - r.launch() - client.run("lmi_dist mistral-7b-marlin".split()) - - def test_llama2_13b_flashinfer(self): - with Runner('lmi', 'llama-2-13b-flashinfer') as r: - prepare.build_lmi_dist_model("llama-2-13b-flashinfer") - envs = [ - "VLLM_ATTENTION_BACKEND=FLASHINFER", - ] - r.launch(env_vars=envs) - client.run("lmi_dist llama-2-13b-flashinfer".split()) - - def test_llama2_tiny_autoawq(self): - with Runner('lmi', 'llama-2-tiny-autoawq') as r: - prepare.build_lmi_dist_model("llama-2-tiny") - r.launch( - "CUDA_VISIBLE_DEVICES=0,1,2,3", - cmd= - "partition --model-dir /opt/ml/input/data/training --save-mp-checkpoint-path /opt/ml/input/data/training/aot" - ) - r.launch("CUDA_VISIBLE_DEVICES=0,1,2,3", - cmd="serve -m test=file:/opt/ml/model/test/aot") - client.run("lmi_dist llama-2-tiny".split()) - os.system('sudo rm -rf models') - - def test_llama3_8b_chunked_prefill(self): - with Runner('lmi', 'llama3-8b-chunked-prefill') as r: - prepare.build_lmi_dist_model("llama3-8b-chunked-prefill") - r.launch() - client.run( - "lmi_dist llama3-8b-chunked-prefill --in_tokens 1200".split()) - - def test_falcon_11b_chunked_prefill(self): - with Runner('lmi', 'falcon-11b-chunked-prefill') as r: - prepare.build_lmi_dist_model("falcon-11b-chunked-prefill") - r.launch() - client.run( - "lmi_dist falcon-11b-chunked-prefill --in_tokens 1200".split()) - - def test_flan_t5_xl(self): - with Runner('lmi', 'flan-t5-xl') as r: - prepare.build_lmi_dist_model("flan-t5-xl") - r.launch() - client.run("lmi_dist flan-t5-xl".split()) - - -@pytest.mark.lmi_dist -@pytest.mark.gpu_4 -class TestLmiDist2: - - def test_gpt_neox_20b(self): - with Runner('lmi', 'octocoder') as r: - prepare.build_lmi_dist_model("octocoder") - r.launch() - client.run("lmi_dist octocoder".split()) - - def test_speculative_llama_13b(self): - with Runner('lmi', 'speculative-llama-13b') as r: - prepare.build_lmi_dist_model("speculative-llama-13b") - r.launch() - client.run("lmi_dist speculative-llama-13b".split()) - - def test_starcoder2_7b(self): - with Runner('lmi', 'starcoder2-7b') as r: - prepare.build_lmi_dist_model("starcoder2-7b") - r.launch() - client.run("lmi_dist starcoder2-7b".split()) - - def test_gemma_2b(self): - with Runner('lmi', 'gemma-2b') as r: - prepare.build_lmi_dist_model("gemma-2b") - r.launch() - client.run("lmi_dist gemma-2b".split()) - - def test_llama2_13b_gptq(self): - with Runner('lmi', 'llama2-13b-gptq') as r: - prepare.build_lmi_dist_model("llama2-13b-gptq") - r.launch() - client.run("lmi_dist llama2-13b-gptq".split()) - - def test_mistral_7b(self): - with Runner('lmi', 'mistral-7b') as r: - prepare.build_lmi_dist_model("mistral-7b") - r.launch() - client.run("lmi_dist mistral-7b".split()) - - def test_llama2_7b_32k(self): - with Runner('lmi', 'llama2-7b-32k') as r: - prepare.build_lmi_dist_model("llama2-7b-32k") - r.launch() - client.run("lmi_dist llama2-7b-32k".split()) - - def test_mistral_7b_128k_awq(self): - with Runner('lmi', 'mistral-7b-128k-awq') as r: - prepare.build_lmi_dist_model("mistral-7b-128k-awq") - r.launch() - client.run("lmi_dist mistral-7b-128k-awq".split()) - - def test_llama2_7b_chat(self): - with Runner('lmi', 'llama2-7b-chat') as r: - prepare.build_lmi_dist_model("llama2-7b-chat") - r.launch() - client.run("lmi_dist_chat llama2-7b-chat".split()) - - def test_llama31_8b_secure(self): - with Runner('lmi', 'llama-3.1-8b') as r: - prepare.build_lmi_dist_model("llama-3.1-8b") - envs = [ - "SAGEMAKER_SECURE_MODE=True", - "SAGEMAKER_SECURITY_CONTROLS=DISALLOW_CUSTOM_INFERENCE_SCRIPTS" - ] - r.launch(env_vars=envs) - client.run("lmi_dist llama-3.1-8b".split()) - - def test_tiny_llama_input_length_exceeded(self): - with Runner('lmi', 'tinyllama-test-input-length-exceeded') as r: - prepare.build_lmi_dist_model("tinyllama-input-len-exceeded") - r.launch() - start = time.perf_counter() - with pytest.raises(ValueError, match=r".*424.*"): - client.run( - "lmi_dist tinyllama-input-len-exceeded --in_tokens 100". - split()) - req_time = time.perf_counter() - start - assert req_time < 20 - client.run( - "vllm tinyllama-input-len-exceeded --in_tokens 10".split()) - - -@pytest.mark.lmi_dist -@pytest.mark.gpu_4 -class TestLmiDistMultiNode: - - def test_llama3_8b(self): - with Runner('lmi', 'llama3-8b') as r: - prepare.build_lmi_dist_model("llama3-8b") - r.launch(cmd="multi_node") - client.run("lmi_dist llama3-8b --in_tokens 1200".split()) - - @pytest.mark.vllm @pytest.mark.gpu_4 class TestVllm1: @@ -679,70 +496,6 @@ def test_lora_phi2_async(self): client.run("vllm_async_adapters phi2-unmerged-lora".split()) -@pytest.mark.lmi_dist -@pytest.mark.lora -@pytest.mark.gpu_4 -class TestLmiDistLora: - - def test_lora_llama2_7b(self): - with Runner('lmi', 'llama-7b-unmerged-lora') as r: - prepare.build_lmi_dist_model("llama-7b-unmerged-lora") - r.launch() - client.run("lmi_dist_adapters llama-7b-unmerged-lora".split()) - - def test_lora_llama2_7b_overflow(self): - with Runner('lmi', 'llama-7b-unmerged-lora-overflow') as r: - prepare.build_lmi_dist_model("llama-7b-unmerged-lora-overflow") - r.launch() - client.run( - "lmi_dist_adapters llama-7b-unmerged-lora-overflow".split()) - - def test_lora_llama2_13b_awq(self): - with Runner('lmi', 'llama2-13b-awq-unmerged-lora') as r: - prepare.build_lmi_dist_model("llama2-13b-awq-unmerged-lora") - r.launch() - client.run( - "lmi_dist_adapters llama2-13b-awq-unmerged-lora".split()) - - def test_lora_mistral_7b(self): - with Runner('lmi', 'mistral-7b-unmerged-lora') as r: - prepare.build_lmi_dist_model("mistral-7b-unmerged-lora") - r.launch() - client.run("lmi_dist_adapters mistral-7b-unmerged-lora".split()) - - def test_lora_mistral_7b_awq(self): - with Runner('lmi', 'mistral-7b-awq-unmerged-lora') as r: - prepare.build_lmi_dist_model("mistral-7b-awq-unmerged-lora") - r.launch() - client.run( - "lmi_dist_adapters mistral-7b-awq-unmerged-lora".split()) - - def test_lora_mistral_7b_gptq(self): - with Runner('lmi', 'mistral-7b-gptq-unmerged-lora') as r: - prepare.build_lmi_dist_model("mistral-7b-gptq-unmerged-lora") - r.launch() - client.run( - "lmi_dist_adapters mistral-7b-gptq-unmerged-lora".split()) - - def test_lora_llama3_8b(self): - with Runner('lmi', 'llama3-8b-unmerged-lora') as r: - prepare.build_lmi_dist_model("llama3-8b-unmerged-lora") - r.launch() - client.run("lmi_dist_adapters llama3-8b-unmerged-lora".split()) - - def test_lora_gemma_7b(self): - with Runner('lmi', 'gemma-7b-unmerged-lora') as r: - prepare.build_lmi_dist_model("gemma-7b-unmerged-lora") - r.launch() - client.run("lmi_dist_adapters gemma-7b-unmerged-lora".split()) - - def test_lora_phi2(self): - with Runner('lmi', 'phi2-unmerged-lora') as r: - prepare.build_lmi_dist_model("phi2-unmerged-lora") - r.launch() - client.run("lmi_dist_adapters phi2-unmerged-lora".split()) - - @pytest.mark.inf class TestNeuronx1: @@ -942,24 +695,6 @@ def test_mistral_7b_fp8(self): "correctness trtllm-mistral-7b-instruct-v0.3-fp8".split()) -@pytest.mark.correctness -@pytest.mark.lmi_dist -@pytest.mark.gpu_4 -class TestCorrectnessLmiDist: - - def test_codestral_22b(self): - with Runner('lmi', 'codestral-22b') as r: - prepare.build_correctness_model("lmi-dist-codestral-22b") - r.launch() - client.run("correctness lmi-dist-codestral-22b".split()) - - def test_llama3_1_8b(self): - with Runner('lmi', 'llama3-1-8b') as r: - prepare.build_correctness_model("lmi-dist-llama3-1-8b") - r.launch() - client.run("correctness lmi-dist-llama3-1-8b".split()) - - @pytest.mark.correctness @pytest.mark.inf class TestCorrectnessNeuronx: @@ -977,33 +712,6 @@ def test_llama3_2_1b(self): client.run("correctness neuronx-llama3-2-1b".split()) -class TestMultiModalLmiDist: - - def test_llava_next(self): - with Runner('lmi', 'llava_v1.6-mistral') as r: - prepare.build_lmi_dist_model('llava_v1.6-mistral') - r.launch() - client.run("multimodal llava_v1.6-mistral".split()) - - def test_phi3_v(self): - with Runner('lmi', 'phi-3-vision-128k-instruct') as r: - prepare.build_lmi_dist_model('phi-3-vision-128k-instruct') - r.launch() - client.run("multimodal phi-3-vision-128k-instruct".split()) - - def test_pixtral_12b(self): - with Runner('lmi', 'pixtral-12b') as r: - prepare.build_lmi_dist_model('pixtral-12b') - r.launch() - client.run("multimodal pixtral-12b".split()) - - def test_mllama_11b(self): - with Runner('lmi', 'llama32-11b-multimodal') as r: - prepare.build_lmi_dist_model('llama32-11b-multimodal') - r.launch() - client.run("multimodal llama32-11b-multimodal".split()) - - class TestMultiModalVllm: def test_llava_next(self): @@ -1032,45 +740,6 @@ def test_pixtral_12b(self): # client.run("multimodal llama32-11b-multimodal".split()) -class TestLmiDistPipelineParallel: - - def test_llama32_3b_multi_worker_tp1_pp1(self): - with Runner('lmi', 'llama32-3b-multi-worker-tp1-pp1') as r: - prepare.build_lmi_dist_model("llama32-3b-multi-worker-tp1-pp1") - r.launch() - client.run("lmi_dist llama32-3b-multi-worker-tp1-pp1".split()) - - def test_llama32_3b_multi_worker_tp2_pp1(self): - with Runner('lmi', 'llama32-3b-multi-worker-tp2-pp1') as r: - prepare.build_lmi_dist_model("llama32-3b-multi-worker-tp2-pp1") - r.launch() - client.run("lmi_dist llama32-3b-multi-worker-tp2-pp1".split()) - - def test_llama32_3b_multi_worker_tp1_pp2(self): - with Runner('lmi', 'llama32-3b-multi-worker-tp1-pp2') as r: - prepare.build_lmi_dist_model("llama32-3b-multi-worker-tp1-pp2") - r.launch() - client.run("lmi_dist llama32-3b-multi-worker-tp1-pp2".split()) - - def test_llama31_8b_pp_only(self): - with Runner('lmi', 'llama31-8b-pp-only') as r: - prepare.build_lmi_dist_model("llama31-8b-pp-only") - r.launch() - client.run("lmi_dist llama31-8b-pp-only".split()) - - def test_llama31_8b_tp2_pp2(self): - with Runner('lmi', 'llama31-8b-tp2-pp2') as r: - prepare.build_lmi_dist_model('llama31-8b-tp2-pp2') - r.launch() - client.run("lmi_dist llama31-8b-tp2-pp2".split()) - - def test_llama31_8b_tp2_pp2_specdec(self): - with Runner('lmi', 'llama31-8b-tp2-pp2-spec-dec') as r: - prepare.build_lmi_dist_model('llama31-8b-tp2-pp2-spec-dec') - r.launch() - client.run("lmi_dist llama31-8b-tp2-pp2-spec-dec".split()) - - @pytest.mark.vllm @pytest.mark.gpu_4 class TestVllmCustomHandlers: diff --git a/wlm/src/test/java/ai/djl/serving/wlm/ModelInfoTest.java b/wlm/src/test/java/ai/djl/serving/wlm/ModelInfoTest.java index bcc11836e..5ae29568f 100644 --- a/wlm/src/test/java/ai/djl/serving/wlm/ModelInfoTest.java +++ b/wlm/src/test/java/ai/djl/serving/wlm/ModelInfoTest.java @@ -226,7 +226,7 @@ public void testInitModel() throws IOException, ModelException { @Test public void testInferLmiEngine() throws IOException, ModelException { - // vllm/lmi-dist features enabled + // vllm features enabled System.setProperty("SERVING_FEATURES", "vllm"); Map modelToRollingBatch = Map.of(