|
119 | 119 | "falcon-40b": "tiiuae/falcon-40b", |
120 | 120 | "falcon-40b-instruct": "tiiuae/falcon-40b-instruct", |
121 | 121 | }, |
| 122 | + LLMInferenceFramework.LIGHTLLM: { |
| 123 | + "llama-7b": "decapoda-research/llama-7b-hf", |
| 124 | + "llama-2-7b": "meta-llama/Llama-2-7b-hf", |
| 125 | + "llama-2-7b-chat": "meta-llama/Llama-2-7b-chat-hf", |
| 126 | + "llama-2-13b": "meta-llama/Llama-2-13b-hf", |
| 127 | + "llama-2-13b-chat": "meta-llama/Llama-2-13b-chat-hf", |
| 128 | + "llama-2-70b": "meta-llama/Llama-2-70b-hf", |
| 129 | + "llama-2-70b-chat": "meta-llama/Llama-2-70b-chat-hf", |
| 130 | + }, |
122 | 131 | } |
123 | 132 |
|
124 | 133 |
|
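To show how the table added above is consumed, here is a minimal, self-contained sketch of the lookup that `create_lightllm_bundle` performs further down when no `checkpoint_path` is supplied. The trimmed dict and the chosen model name are illustrative only; the real code indexes `_SUPPORTED_MODEL_NAMES[LLMInferenceFramework.LIGHTLLM]` directly.

```python
# Illustrative only: a trimmed copy of the LIGHTLLM table above and the lookup used
# when no checkpoint_path is given (the repo id later becomes the server's --model_dir).
lightllm_model_names = {
    "llama-2-7b-chat": "meta-llama/Llama-2-7b-chat-hf",
    "llama-2-13b-chat": "meta-llama/Llama-2-13b-chat-hf",
}

final_weights_folder = lightllm_model_names["llama-2-7b-chat"]
print(final_weights_folder)  # meta-llama/Llama-2-7b-chat-hf
```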
@@ -221,6 +230,15 @@ async def create_model_bundle( |
221 | 230 | num_shards, |
222 | 231 | checkpoint_path, |
223 | 232 | ) |
| 233 | + elif framework == LLMInferenceFramework.LIGHTLLM: |
| 234 | + bundle_id = await self.create_lightllm_bundle( |
| 235 | + user, |
| 236 | + model_name, |
| 237 | + framework_image_tag, |
| 238 | + endpoint_name, |
| 239 | + num_shards, |
| 240 | + checkpoint_path, |
| 241 | + ) |
224 | 242 | else: |
225 | 243 | raise ObjectHasInvalidValueException( |
226 | 244 | f"Framework {framework} is not supported for source {source}." |
@@ -499,6 +517,86 @@ async def create_vllm_bundle( |
499 | 517 | ) |
500 | 518 | ).model_bundle_id |
501 | 519 |
|
| 520 | + async def create_lightllm_bundle( |
| 521 | + self, |
| 522 | + user: User, |
| 523 | + model_name: str, |
| 524 | + framework_image_tag: str, |
| 525 | + endpoint_unique_name: str, |
| 526 | + num_shards: int, |
| 527 | + checkpoint_path: Optional[str], |
| 528 | + ): |
| 529 | + command = [] |
| 530 | + |
| 531 | + # TODO: incorporate auto-calculation of max_total_token_num from https://github.com/ModelTC/lightllm/pull/81 |
| 532 | + max_total_token_num = 6000 # LightLLM default |
| 533 | + if num_shards == 1: |
| 534 | + max_total_token_num = 15000 # Default for Llama 2 7B on 1 x A10 |
| 535 | + elif num_shards == 2: |
| 536 | + max_total_token_num = 21000 # Default for Llama 2 13B on 2 x A10 |
| 537 | + elif num_shards == 4: |
| 538 | + max_total_token_num = 70000 # Default for Llama 2 13B on 4 x A10 |
| 539 | + max_req_input_len = 2047 |
| 540 | + max_req_total_len = 2048 |
| 541 | + if "llama-2" in model_name: |
| 542 | + max_req_input_len = 4095 |
| 543 | + max_req_total_len = 4096 |
| 544 | + |
| 545 | + subcommands = [] |
| 546 | + if checkpoint_path is not None: |
| 547 | + if checkpoint_path.startswith("s3://"): |
| 548 | + final_weights_folder = "model_files" |
| 549 | + subcommands += self.load_model_weights_sub_commands( |
| 550 | + LLMInferenceFramework.LIGHTLLM, |
| 551 | + framework_image_tag, |
| 552 | + checkpoint_path, |
| 553 | + final_weights_folder, |
| 554 | + ) |
| 555 | + else: |
| 556 | + raise ObjectHasInvalidValueException( |
| 557 | + f"Not able to load checkpoint path {checkpoint_path}." |
| 558 | + ) |
| 559 | + else: |
| 560 | + final_weights_folder = _SUPPORTED_MODEL_NAMES[LLMInferenceFramework.LIGHTLLM][model_name] |
| 561 | + |
| 562 | + subcommands.append( |
| 563 | + f"python -m lightllm.server.api_server --model_dir {final_weights_folder} --port 5005 --tp {num_shards} --max_total_token_num {max_total_token_num} --max_req_input_len {max_req_input_len} --max_req_total_len {max_req_total_len} --tokenizer_mode auto" |
| 564 | + ) |
| 565 | + |
| 566 | + command = [ |
| 567 | + "/bin/bash", |
| 568 | + "-c", |
| 569 | + ";".join(subcommands), |
| 570 | + ] |
| 571 | + |
| 572 | + return ( |
| 573 | + await self.create_model_bundle_use_case.execute( |
| 574 | + user, |
| 575 | + CreateModelBundleV2Request( |
| 576 | + name=endpoint_unique_name, |
| 577 | + schema_location="TBA", |
| 578 | + flavor=StreamingEnhancedRunnableImageFlavor( |
| 579 | + flavor=ModelBundleFlavorType.STREAMING_ENHANCED_RUNNABLE_IMAGE, |
| 580 | + repository=hmi_config.lightllm_repository, |
| 581 | + tag=framework_image_tag, |
| 582 | + command=command, |
| 583 | + streaming_command=command, |
| 584 | + protocol="http", |
| 585 | + readiness_initial_delay_seconds=10, |
| 586 | + healthcheck_route="/health", |
| 587 | + predict_route="/generate", |
| 588 | + streaming_predict_route="/generate_stream", |
| 589 | + env={}, |
| 590 | + ), |
| 591 | + metadata={}, |
| 592 | + ), |
| 593 | + do_auth_check=False, |
| 594 | + # Skip the auth check because the LLM create endpoint is called as the user themselves, |
| 595 | + # but the user isn't directly taking the action; it should come from the fine-tune |
| 596 | + # job. |
| 597 | + ) |
| 598 | + ).model_bundle_id |
| 599 | + |
502 | 600 | async def execute( |
503 | 601 | self, user: User, request: CreateLLMModelEndpointV1Request |
504 | 602 | ) -> CreateLLMModelEndpointV1Response: |
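For concreteness, the sketch below reproduces the launch command that `create_lightllm_bundle` assembles for one hypothetical configuration (2 shards, Llama 2 13B, no S3 checkpoint, so the Hugging Face repo id serves as the weights directory). All values are illustrative and simply restate the logic above.

```python
# Illustrative only: command assembly for a hypothetical 2-shard Llama 2 13B endpoint.
num_shards = 2
max_total_token_num = 21000                         # branch taken for num_shards == 2
max_req_input_len, max_req_total_len = 4095, 4096   # "llama-2" is in the model name
final_weights_folder = "meta-llama/Llama-2-13b-hf"  # no checkpoint_path, so the HF repo id is used

subcommands = [
    f"python -m lightllm.server.api_server --model_dir {final_weights_folder} "
    f"--port 5005 --tp {num_shards} --max_total_token_num {max_total_token_num} "
    f"--max_req_input_len {max_req_input_len} --max_req_total_len {max_req_total_len} "
    f"--tokenizer_mode auto"
]
command = ["/bin/bash", "-c", ";".join(subcommands)]
```

When `checkpoint_path` is an `s3://` URI, the download subcommands returned by `load_model_weights_sub_commands` would precede the server launch in the same semicolon-joined string.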
@@ -764,6 +862,19 @@ def model_output_to_completion_output( |
764 | 862 | num_completion_tokens=model_output["count_output_tokens"], |
765 | 863 | tokens=tokens, |
766 | 864 | ) |
| 865 | + elif model_content.inference_framework == LLMInferenceFramework.LIGHTLLM: |
| 866 | + print(model_output) |
| 867 | + tokens = None |
| 868 | + if with_token_probs: |
| 869 | + tokens = [ |
| 870 | + TokenOutput(token=t["text"], log_prob=t["logprob"]) |
| 871 | + for t in model_output["tokens"] |
| 872 | + ] |
| 873 | + return CompletionOutput( |
| 874 | + text=model_output["generated_text"][0], |
| 875 | + num_completion_tokens=model_output["count_output_tokens"], |
| 876 | + tokens=tokens, |
| 877 | + ) |
767 | 878 | else: |
768 | 879 | raise EndpointUnsupportedInferenceTypeException( |
769 | 880 | f"Unsupported inference framework {model_content.inference_framework}" |
@@ -925,6 +1036,44 @@ async def execute( |
925 | 1036 | topic=model_endpoint.record.destination, predict_request=inference_request |
926 | 1037 | ) |
927 | 1038 |
|
| 1039 | + if predict_result.status != TaskStatus.SUCCESS or predict_result.result is None: |
| 1040 | + return CompletionSyncV1Response( |
| 1041 | + request_id=request_id, |
| 1042 | + output=None, |
| 1043 | + ) |
| 1044 | + |
| 1045 | + output = json.loads(predict_result.result["result"]) |
| 1046 | + return CompletionSyncV1Response( |
| 1047 | + request_id=request_id, |
| 1048 | + output=self.model_output_to_completion_output( |
| 1049 | + output, model_endpoint, request.return_token_log_probs |
| 1050 | + ), |
| 1051 | + ) |
| 1052 | + elif endpoint_content.inference_framework == LLMInferenceFramework.LIGHTLLM: |
| 1053 | + lightllm_args: Any = { |
| 1054 | + "inputs": request.prompt, |
| 1055 | + "parameters": { |
| 1056 | + "max_new_tokens": request.max_new_tokens, |
| 1057 | + }, |
| 1058 | + } |
| 1059 | + # TODO: implement stop sequences |
| 1060 | + if request.temperature > 0: |
| 1061 | + lightllm_args["parameters"]["temperature"] = request.temperature |
| 1062 | + lightllm_args["parameters"]["do_sample"] = True |
| 1063 | + else: |
| 1064 | + lightllm_args["parameters"]["do_sample"] = False |
| 1065 | + if request.return_token_log_probs: |
| 1066 | + lightllm_args["parameters"]["return_details"] = True |
| 1067 | + |
| 1068 | + inference_request = SyncEndpointPredictV1Request( |
| 1069 | + args=lightllm_args, |
| 1070 | + num_retries=NUM_DOWNSTREAM_REQUEST_RETRIES, |
| 1071 | + timeout_seconds=DOWNSTREAM_REQUEST_TIMEOUT_SECONDS, |
| 1072 | + ) |
| 1073 | + predict_result = await inference_gateway.predict( |
| 1074 | + topic=model_endpoint.record.destination, predict_request=inference_request |
| 1075 | + ) |
| 1076 | + |
928 | 1077 | if predict_result.status != TaskStatus.SUCCESS or predict_result.result is None: |
929 | 1078 | return CompletionSyncV1Response( |
930 | 1079 | request_id=request_id, |
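As a concrete illustration of what this branch sends downstream, here is a hypothetical `lightllm_args` payload for a sampling request (temperature 0.7, 64 new tokens, token log probs requested). The values are invented and merely restate the dict assembly above.

```python
# Hypothetical sync request body built by the LIGHTLLM branch above.
lightllm_args = {
    "inputs": "Tell me about llamas.",
    "parameters": {
        "max_new_tokens": 64,
        "temperature": 0.7,      # set only because temperature > 0
        "do_sample": True,       # sampling enabled alongside temperature
        "return_details": True,  # set because return_token_log_probs was requested
    },
}
# With temperature == 0, "temperature" is omitted and "do_sample" is False.
```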
@@ -1055,6 +1204,25 @@ async def execute( |
1055 | 1204 | if request.return_token_log_probs: |
1056 | 1205 | args["logprobs"] = 1 |
1057 | 1206 | args["stream"] = True |
| 1207 | + elif model_content.inference_framework == LLMInferenceFramework.LIGHTLLM: |
| 1208 | + args = { |
| 1209 | + "inputs": request.prompt, |
| 1210 | + "parameters": { |
| 1211 | + "max_new_tokens": request.max_new_tokens, |
| 1212 | + }, |
| 1213 | + } |
| 1214 | + # TODO: stop sequences |
| 1215 | + if request.temperature > 0: |
| 1216 | + args["parameters"]["temperature"] = request.temperature |
| 1217 | + args["parameters"]["do_sample"] = True |
| 1218 | + else: |
| 1219 | + args["parameters"]["do_sample"] = False |
| 1220 | + if request.return_token_log_probs: |
| 1221 | + args["parameters"]["return_details"] = True |
| 1222 | + else: |
| 1223 | + raise EndpointUnsupportedInferenceTypeException( |
| 1224 | + f"Unsupported inference framework {model_content.inference_framework}" |
| 1225 | + ) |
1058 | 1226 |
|
1059 | 1227 | inference_request = SyncEndpointPredictV1Request( |
1060 | 1228 | args=args, |
@@ -1163,6 +1331,30 @@ async def execute( |
1163 | 1331 | request_id=request_id, |
1164 | 1332 | output=None, |
1165 | 1333 | ) |
| 1334 | + elif model_content.inference_framework == LLMInferenceFramework.LIGHTLLM: |
| 1335 | + if res.status == TaskStatus.SUCCESS and result is not None: |
| 1336 | + print(result) |
| 1337 | + token = None |
| 1338 | + num_completion_tokens += 1 |
| 1339 | + if request.return_token_log_probs: |
| 1340 | + token = TokenOutput( |
| 1341 | + token=result["result"]["token"]["text"], |
| 1342 | + log_prob=result["result"]["token"]["logprob"], |
| 1343 | + ) |
| 1344 | + yield CompletionStreamV1Response( |
| 1345 | + request_id=request_id, |
| 1346 | + output=CompletionStreamOutput( |
| 1347 | + text=result["result"]["token"]["text"], |
| 1348 | + finished=result["result"]["finished"], |
| 1349 | + num_completion_tokens=num_completion_tokens, |
| 1350 | + token=token, |
| 1351 | + ), |
| 1352 | + ) |
| 1353 | + else: |
| 1354 | + yield CompletionStreamV1Response( |
| 1355 | + request_id=request_id, |
| 1356 | + output=None, |
| 1357 | + ) |
1166 | 1358 | else: |
1167 | 1359 | raise EndpointUnsupportedInferenceTypeException( |
1168 | 1360 | f"Unsupported inference framework {model_content.inference_framework}" |
|