
Commit b7284df

Enforce model checkpoints existing for endpoint/bundle creation (#503)
* Enforce model checkpoints existing for endpoint/bundle creation
* Add test mock for good models info
* Clean up checkpoint validation
* Rename validate to get for semantics
1 parent 55d538b commit b7284df

File tree

3 files changed: +157 -62 lines changed

- model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
- model-engine/model_engine_server/infra/repositories/live_tokenizer_repository.py
- model-engine/tests/unit/domain/test_llm_use_cases.py


model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py

Lines changed: 59 additions & 58 deletions
@@ -329,6 +329,27 @@ def validate_quantization(
         )
 
 
+def validate_checkpoint_path_uri(checkpoint_path: str) -> None:
+    if not checkpoint_path.startswith("s3://"):
+        raise ObjectHasInvalidValueException(
+            f"Only S3 paths are supported. Given checkpoint path: {checkpoint_path}."
+        )
+
+
+def get_checkpoint_path(model_name: str, checkpoint_path_override: Optional[str]) -> str:
+    checkpoint_path = (
+        SUPPORTED_MODELS_INFO[model_name].s3_repo
+        if not checkpoint_path_override
+        else checkpoint_path_override
+    )
+
+    if not checkpoint_path:
+        raise InvalidRequestException(f"No checkpoint path found for model {model_name}")
+
+    validate_checkpoint_path_uri(checkpoint_path)
+    return checkpoint_path
+
+
 class CreateLLMModelBundleV1UseCase:
     def __init__(
         self,
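A minimal usage sketch of the new helper (illustrative, not part of the diff; the model name is hypothetical, and SUPPORTED_MODELS_INFO is assumed to map model names to ModelInfo(hf_repo, s3_repo) entries):

    # With no override, resolution falls back to the model's registered s3_repo;
    # InvalidRequestException is raised when that entry has no S3 checkpoint at all.
    path = get_checkpoint_path("example-model", checkpoint_path_override=None)

    # Any resolved path that is not an s3:// URI fails validate_checkpoint_path_uri.
    get_checkpoint_path("example-model", "gs://bucket/weights")  # raises ObjectHasInvalidValueException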
@@ -449,22 +470,16 @@ async def create_text_generation_inference_bundle(
             max_total_tokens = 4096
 
         subcommands = []
-        if checkpoint_path is not None:
-            if checkpoint_path.startswith("s3://"):
-                final_weights_folder = "model_files"
 
-                subcommands += self.load_model_weights_sub_commands(
-                    LLMInferenceFramework.TEXT_GENERATION_INFERENCE,
-                    framework_image_tag,
-                    checkpoint_path,
-                    final_weights_folder,
-                )
-            else:
-                raise ObjectHasInvalidValueException(
-                    f"Only S3 paths are supported. Given checkpoint path: {checkpoint_path}."
-                )
-        else:
-            final_weights_folder = SUPPORTED_MODELS_INFO[model_name].hf_repo
+        checkpoint_path = get_checkpoint_path(model_name, checkpoint_path)
+        final_weights_folder = "model_files"
+
+        subcommands += self.load_model_weights_sub_commands(
+            LLMInferenceFramework.TEXT_GENERATION_INFERENCE,
+            framework_image_tag,
+            checkpoint_path,
+            final_weights_folder,
+        )
 
         subcommands.append(
             f"text-generation-launcher --hostname :: --model-id {final_weights_folder} --num-shard {num_shards} --port 5005 --max-input-length {max_input_length} --max-total-tokens {max_total_tokens}"
@@ -672,25 +687,19 @@ async def create_vllm_bundle(
                 break
 
         subcommands = []
-        if checkpoint_path is not None:
-            if checkpoint_path.startswith("s3://"):
-                # added as workaround since transformers doesn't support mistral yet, vllm expects "mistral" in model weights folder
-                if "mistral" in model_name:
-                    final_weights_folder = "mistral_files"
-                else:
-                    final_weights_folder = "model_files"
-                subcommands += self.load_model_weights_sub_commands(
-                    LLMInferenceFramework.VLLM,
-                    framework_image_tag,
-                    checkpoint_path,
-                    final_weights_folder,
-                )
-            else:
-                raise ObjectHasInvalidValueException(
-                    f"Only S3 paths are supported. Given checkpoint path: {checkpoint_path}."
-                )
+
+        checkpoint_path = get_checkpoint_path(model_name, checkpoint_path)
+        # added as workaround since transformers doesn't support mistral yet, vllm expects "mistral" in model weights folder
+        if "mistral" in model_name:
+            final_weights_folder = "mistral_files"
         else:
-            final_weights_folder = SUPPORTED_MODELS_INFO[model_name].hf_repo
+            final_weights_folder = "model_files"
+        subcommands += self.load_model_weights_sub_commands(
+            LLMInferenceFramework.VLLM,
+            framework_image_tag,
+            checkpoint_path,
+            final_weights_folder,
+        )
 
         if max_model_len:
             subcommands.append(
@@ -770,21 +779,15 @@ async def create_lightllm_bundle(
             max_req_total_len = 4096
 
         subcommands = []
-        if checkpoint_path is not None:
-            if checkpoint_path.startswith("s3://"):
-                final_weights_folder = "model_files"
-                subcommands += self.load_model_weights_sub_commands(
-                    LLMInferenceFramework.LIGHTLLM,
-                    framework_image_tag,
-                    checkpoint_path,
-                    final_weights_folder,
-                )
-            else:
-                raise ObjectHasInvalidValueException(
-                    f"Only S3 paths are supported. Given checkpoint path: {checkpoint_path}."
-                )
-        else:
-            final_weights_folder = SUPPORTED_MODELS_INFO[model_name].hf_repo
+
+        checkpoint_path = get_checkpoint_path(model_name, checkpoint_path)
+        final_weights_folder = "model_files"
+        subcommands += self.load_model_weights_sub_commands(
+            LLMInferenceFramework.LIGHTLLM,
+            framework_image_tag,
+            checkpoint_path,
+            final_weights_folder,
+        )
 
         subcommands.append(
             f"python -m lightllm.server.api_server --model_dir {final_weights_folder} --port 5005 --tp {num_shards} --max_total_token_num {max_total_token_num} --max_req_input_len {max_req_input_len} --max_req_total_len {max_req_total_len} --tokenizer_mode auto"
@@ -835,20 +838,18 @@ async def create_tensorrt_llm_bundle(
         command = []
 
         subcommands = []
-        if checkpoint_path is not None:
-            if checkpoint_path.startswith("s3://"):
-                subcommands += self.load_model_files_sub_commands_trt_llm(
-                    checkpoint_path,
-                )
-            else:
-                raise ObjectHasInvalidValueException(
-                    f"Only S3 paths are supported. Given checkpoint path: {checkpoint_path}."
-                )
-        else:
+
+        if not checkpoint_path:
             raise ObjectHasInvalidValueException(
                 "Checkpoint must be provided for TensorRT-LLM models."
             )
 
+        validate_checkpoint_path_uri(checkpoint_path)
+
+        subcommands += self.load_model_files_sub_commands_trt_llm(
+            checkpoint_path,
+        )
+
         subcommands.append(
             f"python3 launch_triton_server.py --world_size={num_shards} --model_repo=./model_repo/"
         )

model-engine/model_engine_server/infra/repositories/live_tokenizer_repository.py

Lines changed: 4 additions & 3 deletions
@@ -1,7 +1,6 @@
 import os
-from collections import namedtuple
 from functools import lru_cache
-from typing import Dict, Optional
+from typing import Dict, NamedTuple, Optional
 
 from huggingface_hub import list_repo_refs
 from huggingface_hub.utils._errors import RepositoryNotFoundError
@@ -25,7 +24,9 @@
 TOKENIZER_TARGET_DIR = "/opt/.cache/model_engine_server/tokenizers"
 
 
-ModelInfo = namedtuple("ModelInfo", ["hf_repo", "s3_repo"])
+class ModelInfo(NamedTuple):
+    hf_repo: str
+    s3_repo: Optional[str]
 
 
 def get_default_supported_models_info() -> Dict[str, ModelInfo]:
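For illustration (not part of the diff): the typed NamedTuple keeps the old namedtuple's runtime behaviour while letting type checkers see that s3_repo may be absent.

    # Construction and field access are unchanged; only the annotations are new.
    info = ModelInfo(hf_repo="mosaicml/mpt-7b", s3_repo=None)
    assert info.hf_repo == "mosaicml/mpt-7b"
    assert info.s3_repo is None  # Optional[str]: a model with no S3 checkpoint is representable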

model-engine/tests/unit/domain/test_llm_use_cases.py

Lines changed: 94 additions & 1 deletion
@@ -1,5 +1,5 @@
 import json
-from typing import Any, List, Tuple
+from typing import Any, Dict, List, Tuple
 from unittest import mock
 
 import pytest
@@ -54,9 +54,22 @@
     validate_and_update_completion_params,
 )
 from model_engine_server.domain.use_cases.model_bundle_use_cases import CreateModelBundleV2UseCase
+from model_engine_server.infra.repositories import live_tokenizer_repository
+from model_engine_server.infra.repositories.live_tokenizer_repository import ModelInfo
+
+
+def good_models_info() -> Dict[str, ModelInfo]:
+    return {
+        k: ModelInfo(v.hf_repo, "s3://test-s3.tar")
+        for k, v in live_tokenizer_repository.SUPPORTED_MODELS_INFO.items()
+    }
 
 
 @pytest.mark.asyncio
+@mock.patch(
+    "model_engine_server.domain.use_cases.llm_model_endpoint_use_cases.SUPPORTED_MODELS_INFO",
+    good_models_info(),
+)
 async def test_create_model_endpoint_use_case_success(
     test_api_key: str,
     fake_model_bundle_repository,
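A note on the mocking pattern used here and in the tests below (sketch with a hypothetical test name): passing a value as the second argument to mock.patch swaps it in for the module-level SUPPORTED_MODELS_INFO dict for the duration of the test, so checkpoint resolution sees the fake entries.

    # good_models_info() is evaluated once at decoration time; the returned dict
    # temporarily replaces the attribute named by the dotted path while the test runs.
    @mock.patch(
        "model_engine_server.domain.use_cases.llm_model_endpoint_use_cases.SUPPORTED_MODELS_INFO",
        good_models_info(),
    )
    async def test_uses_patched_models_info():  # hypothetical
        ...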
@@ -170,6 +183,82 @@ async def test_create_model_endpoint_use_case_success(
     assert "--max-total-tokens" in bundle.flavor.command[-1] and "4096" in bundle.flavor.command[-1]
 
 
+def bad_models_info() -> Dict[str, ModelInfo]:
+    info = {
+        k: ModelInfo(v.hf_repo, v.s3_repo)
+        for k, v in live_tokenizer_repository.SUPPORTED_MODELS_INFO.items()
+    }
+    info.update(
+        {
+            "mpt-7b": ModelInfo("mosaicml/mpt-7b", None),
+            "mpt-7b-instruct": ModelInfo("mosaicml/mpt-7b-instruct", "gibberish"),
+        }
+    )
+    return info
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "inference_framework, model_name, expected_error",
+    [
+        (LLMInferenceFramework.TEXT_GENERATION_INFERENCE, "mpt-7b", InvalidRequestException),
+        (
+            LLMInferenceFramework.TEXT_GENERATION_INFERENCE,
+            "mpt-7b-instruct",
+            ObjectHasInvalidValueException,
+        ),
+        (LLMInferenceFramework.LIGHTLLM, "mpt-7b", InvalidRequestException),
+        (LLMInferenceFramework.LIGHTLLM, "mpt-7b-instruct", ObjectHasInvalidValueException),
+        (LLMInferenceFramework.VLLM, "mpt-7b", InvalidRequestException),
+        (LLMInferenceFramework.VLLM, "mpt-7b-instruct", ObjectHasInvalidValueException),
+    ],
+)
+@mock.patch(
+    "model_engine_server.domain.use_cases.llm_model_endpoint_use_cases.SUPPORTED_MODELS_INFO",
+    bad_models_info(),
+)
+async def test_create_model_bundle_fails_if_no_checkpoint(
+    test_api_key: str,
+    fake_model_bundle_repository,
+    fake_model_endpoint_service,
+    fake_docker_repository_image_always_exists,
+    fake_model_primitive_gateway,
+    fake_llm_artifact_gateway,
+    create_llm_model_endpoint_text_generation_inference_request_streaming: CreateLLMModelEndpointV1Request,
+    inference_framework,
+    model_name,
+    expected_error,
+):
+    fake_model_endpoint_service.model_bundle_repository = fake_model_bundle_repository
+    bundle_use_case = CreateModelBundleV2UseCase(
+        model_bundle_repository=fake_model_bundle_repository,
+        docker_repository=fake_docker_repository_image_always_exists,
+        model_primitive_gateway=fake_model_primitive_gateway,
+    )
+    use_case = CreateLLMModelBundleV1UseCase(
+        create_model_bundle_use_case=bundle_use_case,
+        model_bundle_repository=fake_model_bundle_repository,
+        llm_artifact_gateway=fake_llm_artifact_gateway,
+        docker_repository=fake_docker_repository_image_always_exists,
+    )
+    user = User(user_id=test_api_key, team_id=test_api_key, is_privileged_user=True)
+    request = create_llm_model_endpoint_text_generation_inference_request_streaming.copy()
+
+    with pytest.raises(expected_error):
+        await use_case.execute(
+            user=user,
+            endpoint_name=request.name,
+            model_name=model_name,
+            source=request.source,
+            framework=inference_framework,
+            framework_image_tag="0.0.0",
+            endpoint_type=request.endpoint_type,
+            num_shards=request.num_shards,
+            quantize=request.quantize,
+            checkpoint_path=None,
+        )
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "valid, inference_framework, inference_framework_image_tag",
@@ -180,6 +269,10 @@ async def test_create_model_endpoint_use_case_success(
         (True, LLMInferenceFramework.VLLM, "0.1.3.6"),
     ],
 )
+@mock.patch(
+    "model_engine_server.domain.use_cases.llm_model_endpoint_use_cases.SUPPORTED_MODELS_INFO",
+    good_models_info(),
+)
 async def test_create_model_bundle_inference_framework_image_tag_validation(
     test_api_key: str,
     fake_model_bundle_repository,
