@@ -275,24 +275,6 @@ async def _get_latest_tag(inference_framework: LLMInferenceFramework) -> str:
     return config_map[inference_framework]
 
 
-def _include_safetensors_bin_or_pt(model_files: List[str]) -> Optional[str]:
-    """
-    This function is used to determine whether to include "*.safetensors", "*.bin", or "*.pt" files
-    based on which file type is present most often in the checkpoint folder. The most
-    frequently present file type is included.
-    In case of ties, priority is given to "*.safetensors", then "*.bin", then "*.pt".
-    """
-    num_safetensors = len([f for f in model_files if f.endswith(".safetensors")])
-    num_bin = len([f for f in model_files if f.endswith(".bin")])
-    num_pt = len([f for f in model_files if f.endswith(".pt")])
-    maximum = max(num_safetensors, num_bin, num_pt)
-    if num_safetensors == maximum:
-        return "*.safetensors"
-    if num_bin == maximum:
-        return "*.bin"
-    return "*.pt"
-
-
 def _model_endpoint_entity_to_get_llm_model_endpoint_response(
     model_endpoint: ModelEndpoint,
 ) -> GetLLMModelEndpointV1Response:
@@ -354,6 +336,10 @@ def validate_checkpoint_path_uri(checkpoint_path: str) -> None:
         raise ObjectHasInvalidValueException(
             f"Only S3 paths are supported. Given checkpoint path: {checkpoint_path}."
         )
+    if checkpoint_path.endswith(".tar"):
+        raise ObjectHasInvalidValueException(
+            f"Tar files are not supported. Given checkpoint path: {checkpoint_path}."
+        )
 
 
 def get_checkpoint_path(model_name: str, checkpoint_path_override: Optional[str]) -> str:
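For reference, a minimal standalone sketch of the tightened validation (the exception class is a stand-in, the S3 prefix check is simplified relative to the real helper, and the paths are hypothetical):

```python
class ObjectHasInvalidValueException(Exception):
    """Stand-in for the exception type used in model_engine_server."""

def validate_checkpoint_path_uri(checkpoint_path: str) -> None:
    # Simplified mirror of the updated validation: S3-only paths, and now no tar archives.
    if not checkpoint_path.startswith("s3://"):
        raise ObjectHasInvalidValueException(
            f"Only S3 paths are supported. Given checkpoint path: {checkpoint_path}."
        )
    if checkpoint_path.endswith(".tar"):
        raise ObjectHasInvalidValueException(
            f"Tar files are not supported. Given checkpoint path: {checkpoint_path}."
        )

validate_checkpoint_path_uri("s3://my-bucket/llama-2-7b/")   # accepted (hypothetical path)
try:
    validate_checkpoint_path_uri("s3://my-bucket/weights.tar")  # now rejected
except ObjectHasInvalidValueException as e:
    print(e)  # Tar files are not supported. ...
```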
@@ -370,6 +356,14 @@ def get_checkpoint_path(model_name: str, checkpoint_path_override: Optional[str]
     return checkpoint_path
 
 
+def validate_checkpoint_files(checkpoint_files: List[str]) -> None:
+    """Require safetensors in the checkpoint path."""
+    model_files = [f for f in checkpoint_files if "model" in f]
+    num_safetensors = len([f for f in model_files if f.endswith(".safetensors")])
+    if num_safetensors == 0:
+        raise ObjectHasInvalidValueException("No safetensors found in the checkpoint path.")
+
+
 class CreateLLMModelBundleV1UseCase:
     def __init__(
         self,
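Similarly, a small self-contained sketch of what the new `validate_checkpoint_files` helper enforces (the exception class below is a stand-in for the real one and the file lists are hypothetical):

```python
from typing import List

class ObjectHasInvalidValueException(Exception):
    """Stand-in for the exception type used in model_engine_server."""

def validate_checkpoint_files(checkpoint_files: List[str]) -> None:
    """Mirror of the helper added above: at least one safetensors weight file is required."""
    model_files = [f for f in checkpoint_files if "model" in f]
    num_safetensors = len([f for f in model_files if f.endswith(".safetensors")])
    if num_safetensors == 0:
        raise ObjectHasInvalidValueException("No safetensors found in the checkpoint path.")

# Accepted: safetensors weights are present (hypothetical file list)
validate_checkpoint_files(["config.json", "model-00001-of-00002.safetensors"])

# Rejected: only .bin weights, which this change no longer downloads
try:
    validate_checkpoint_files(["config.json", "pytorch_model.bin"])
except ObjectHasInvalidValueException as e:
    print(e)  # No safetensors found in the checkpoint path.
```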
@@ -557,27 +551,14 @@ def load_model_weights_sub_commands(
         else:
             s5cmd = "./s5cmd"
 
-        base_path = checkpoint_path.split("/")[-1]
-        if base_path.endswith(".tar"):
-            # If the checkpoint file is a tar file, extract it into final_weights_folder
-            subcommands.extend(
-                [
-                    f"{s5cmd} cp {checkpoint_path} .",
-                    f"mkdir -p {final_weights_folder}",
-                    f"tar --no-same-owner -xf {base_path} -C {final_weights_folder}",
-                ]
-            )
-        else:
-            # Let's check whether to exclude "*.safetensors" or "*.bin" files
-            checkpoint_files = self.llm_artifact_gateway.list_files(checkpoint_path)
-            model_files = [f for f in checkpoint_files if "model" in f]
-
-            include_str = _include_safetensors_bin_or_pt(model_files)
-            file_selection_str = f"--include '*.model' --include '*.json' --include '{include_str}' --exclude 'optimizer*'"
-            subcommands.append(
-                f"{s5cmd} --numworkers 512 cp --concurrency 10 {file_selection_str} {os.path.join(checkpoint_path, '*')} {final_weights_folder}"
-            )
+        checkpoint_files = self.llm_artifact_gateway.list_files(checkpoint_path)
+        validate_checkpoint_files(checkpoint_files)
 
+        # filter to configs ('*.model' and '*.json') and weights ('*.safetensors')
+        file_selection_str = "--include '*.model' --include '*.json' --include '*.safetensors' --exclude 'optimizer*'"
+        subcommands.append(
+            f"{s5cmd} --numworkers 512 cp --concurrency 10 {file_selection_str} {os.path.join(checkpoint_path, '*')} {final_weights_folder}"
+        )
         return subcommands
 
     def load_model_files_sub_commands_trt_llm(
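To illustrate the simplified download path, a sketch of the single s5cmd command the new code emits (the bucket and destination folder names are hypothetical; the flags mirror the `file_selection_str` above):

```python
import os

# Hypothetical inputs; in the use case these come from the endpoint's checkpoint config.
s5cmd = "./s5cmd"
checkpoint_path = "s3://my-bucket/llama-2-7b"
final_weights_folder = "model_files"

file_selection_str = (
    "--include '*.model' --include '*.json' --include '*.safetensors' --exclude 'optimizer*'"
)
print(
    f"{s5cmd} --numworkers 512 cp --concurrency 10 "
    f"{file_selection_str} {os.path.join(checkpoint_path, '*')} {final_weights_folder}"
)
# ./s5cmd --numworkers 512 cp --concurrency 10 --include '*.model' --include '*.json'
#   --include '*.safetensors' --exclude 'optimizer*' s3://my-bucket/llama-2-7b/* model_files
```

With tar extraction removed, every checkpoint is copied from a folder, and only tokenizer/config files plus safetensors weights are downloaded; optimizer state is skipped.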
@@ -591,19 +572,9 @@ def load_model_files_sub_commands_trt_llm(
         See llm-engine/model-engine/model_engine_server/inference/tensorrt-llm/triton_model_repo/tensorrt_llm/config.pbtxt
         and llm-engine/model-engine/model_engine_server/inference/tensorrt-llm/triton_model_repo/postprocessing/config.pbtxt
         """
-        subcommands = []
-
-        base_path = checkpoint_path.split("/")[-1]
-
-        if base_path.endswith(".tar"):
-            raise ObjectHasInvalidValueException(
-                "Checkpoint for TensorRT-LLM models must be a folder, not a tar file."
-            )
-        else:
-            subcommands.append(
-                f"./s5cmd --numworkers 512 cp --concurrency 50 {os.path.join(checkpoint_path, '*')} ./"
-            )
-
+        subcommands = [
+            f"./s5cmd --numworkers 512 cp --concurrency 50 {os.path.join(checkpoint_path, '*')} ./"
+        ]
         return subcommands
 
     async def create_deepspeed_bundle(