
Commit fe56840

Hardcode llama 3 70b endpoint param (#524)
* Hardcode some tuning for endpoints
* remove mixtral 8x22b hardcode
* test
1 parent: 0a364d8

File tree (4 files changed: +42 −0 lines changed)

- model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
- model-engine/tests/unit/conftest.py
- model-engine/tests/unit/domain/conftest.py
- model-engine/tests/unit/domain/test_llm_use_cases.py

model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py

Lines changed: 3 additions & 0 deletions
@@ -677,6 +677,9 @@ async def create_vllm_bundle(
         if hmi_config.sensitive_log_mode:  # pragma: no cover
             subcommands[-1] = subcommands[-1] + " --disable-log-requests"
 
+        if "llama-3-70b" in model_name:
+            subcommands[-1] = subcommands[-1] + " --gpu-memory-utilization 0.95 --enforce-eager"
+
         command = [
             "/bin/bash",
             "-c",

model-engine/tests/unit/conftest.py

Lines changed: 1 addition & 0 deletions
@@ -763,6 +763,7 @@ def __init__(self):
             "llama-7b/special_tokens_map.json": ["llama-7b/special_tokens_map.json"],
             "llama-2-7b": ["model-fake.safetensors"],
             "mpt-7b": ["model-fake.safetensors"],
+            "llama-3-70b": ["model-fake.safetensors"],
         }
         self.urls = {"filename": "https://test-bucket.s3.amazonaws.com/llm/llm-1.0.0.tar.gz"}
         self.model_config = {

model-engine/tests/unit/domain/conftest.py

Lines changed: 27 additions & 0 deletions
@@ -292,6 +292,33 @@ def create_llm_model_endpoint_request_llama_2() -> CreateLLMModelEndpointV1Request
     )
 
 
+@pytest.fixture
+def create_llm_model_endpoint_request_llama_3_70b() -> CreateLLMModelEndpointV1Request:
+    return CreateLLMModelEndpointV1Request(
+        name="test_llm_endpoint_name_llama_3_70b",
+        model_name="llama-3-70b",
+        source="hugging_face",
+        inference_framework="vllm",
+        inference_framework_image_tag="1.0.0",
+        num_shards=2,
+        endpoint_type=ModelEndpointType.STREAMING,
+        metadata={},
+        post_inference_hooks=["billing"],
+        cpus=1,
+        gpus=2,
+        memory="8G",
+        gpu_type=GpuType.NVIDIA_HOPPER_H100,
+        storage="10G",
+        min_workers=1,
+        max_workers=3,
+        per_worker=2,
+        labels={"team": "infra", "product": "my_product"},
+        aws_role="test_aws_role",
+        results_s3_bucket="test_s3_bucket",
+        checkpoint_path="s3://llama-3-70b",
+    )
+
+
 @pytest.fixture
 def create_llm_model_endpoint_text_generation_inference_request_streaming() -> (
     CreateLLMModelEndpointV1Request
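
As a quick illustration of how pytest wires this in, here is a hypothetical minimal consumer (not part of this commit; the real consumer is `test_create_model_endpoint_use_case_success` in the next file): pytest injects the fixture simply by matching the test's parameter name.

```python
# Hypothetical test, not from this commit: pytest resolves the parameter
# name against the fixture defined above and injects the built request.
def test_llama_3_70b_request_shape(
    create_llm_model_endpoint_request_llama_3_70b,
):
    request = create_llm_model_endpoint_request_llama_3_70b
    assert request.model_name == "llama-3-70b"
    assert request.inference_framework == "vllm"
    # Matches the fixture values above: 2 shards across 2 GPUs.
    assert request.num_shards == request.gpus == 2
```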

model-engine/tests/unit/domain/test_llm_use_cases.py

Lines changed: 11 additions & 0 deletions
@@ -80,6 +80,7 @@ async def test_create_model_endpoint_use_case_success(
     create_llm_model_endpoint_request_sync: CreateLLMModelEndpointV1Request,
     create_llm_model_endpoint_request_streaming: CreateLLMModelEndpointV1Request,
     create_llm_model_endpoint_request_llama_2: CreateLLMModelEndpointV1Request,
+    create_llm_model_endpoint_request_llama_3_70b: CreateLLMModelEndpointV1Request,
 ):
     fake_model_endpoint_service.model_bundle_repository = fake_model_bundle_repository
     bundle_use_case = CreateModelBundleV2UseCase(
@@ -182,6 +183,16 @@ async def test_create_model_endpoint_use_case_success(
     )
     assert "--max-total-tokens" in bundle.flavor.command[-1] and "4096" in bundle.flavor.command[-1]
 
+    response_5 = await use_case.execute(
+        user=user, request=create_llm_model_endpoint_request_llama_3_70b
+    )
+    assert response_5.endpoint_creation_task_id
+    assert isinstance(response_5, CreateLLMModelEndpointV1Response)
+    bundle = await fake_model_bundle_repository.get_latest_model_bundle_by_name(
+        owner=user.team_id, name=create_llm_model_endpoint_request_llama_3_70b.name
+    )
+    assert " --gpu-memory-utilization 0.95" in bundle.flavor.command[-1]
+
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
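
To exercise just this test locally, something like the following should work; the path comes from the repository layout shown in this commit, and the `-k` filter expression is an assumption about how you want to narrow the run.

```python
# Hypothetical runner script, equivalent to invoking pytest from the CLI.
import sys

import pytest

sys.exit(
    pytest.main(
        [
            "model-engine/tests/unit/domain/test_llm_use_cases.py",
            "-k", "test_create_model_endpoint_use_case_success",
            "-v",
        ]
    )
)
```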
