Commit dce6632

enable prompt template for gguf format inference (#57)
Parent: d59b5d8

File tree: 5 files changed (80 additions, 29 deletions)


llmserve/backend/llm/initializers/llamacpp.py

Lines changed: 7 additions & 11 deletions
@@ -66,9 +66,9 @@ def __init__(
 
     def _get_model_init_kwargs(self) -> Dict[str, Any]:
         return {
-            # We use a large integer to put all of the layers on GPU by default.
-            "n_gpu_layers": 0 if self.device.type == "cpu" else 10**6,
-            "seed": 0,
+            # -1 means all layers are offloaded to GPU
+            "n_gpu_layers": 0 if self.device.type == "cpu" else -1,
+            "seed": -1,
             "verbose": False,
             "n_threads": int(os.environ["OMP_NUM_THREADS"]),
             **self.model_init_kwargs,
@@ -82,15 +82,11 @@ def load_model(self, model_id: str) -> "Llama":
         # Lazy import to avoid issues on CPU head node
         from llama_cpp import Llama
 
-        return Llama(
+        self.model = Llama(
             model_path=os.path.abspath(model_path),
             **self._get_model_init_kwargs(),
         )
-
+        return self.model
+
     def load_tokenizer(self, tokenizer_name: str) -> None:
-        return None
-
-    def postprocess(
-        self, model: "Llama", tokenizer: None
-    ) -> Tuple["Llama", LlamaCppTokenizer]:
-        return super().postprocess(model, LlamaCppTokenizer(model))
+        return LlamaCppTokenizer(self.model)
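For reference, a minimal standalone sketch of what the new initializer defaults mean in llama-cpp-python terms. The helper function and model path below are illustrative placeholders, not part of this repository:

```python
# Illustrative sketch only: maps the initializer's new defaults onto llama-cpp-python.
import os
from llama_cpp import Llama

def llama_init_kwargs(device_type: str) -> dict:
    return {
        # -1 offloads every layer to the GPU; 0 keeps everything on the CPU
        "n_gpu_layers": 0 if device_type == "cpu" else -1,
        # -1 asks llama.cpp to pick a random seed
        "seed": -1,
        "verbose": False,
        "n_threads": int(os.environ.get("OMP_NUM_THREADS", "4")),
    }

# Placeholder path; any local GGUF file works.
llm = Llama(model_path="/models/qwen1_5-7b-chat-q3_k_m.gguf", **llama_init_kwargs("cpu"))
```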

llmserve/backend/llm/pipelines/llamacpp/llamacpp_pipeline.py

Lines changed: 30 additions & 3 deletions
@@ -9,6 +9,7 @@
 from ...initializers.llamacpp import LlamaCppInitializer, LlamaCppTokenizer
 from .._base import StreamingPipeline
 from ..utils import decode_stopping_sequences_where_needed, construct_prompts
+import json
 
 if TYPE_CHECKING:
     from llama_cpp import Llama, LogitsProcessorList, StoppingCriteriaList
@@ -104,20 +105,44 @@ def __call__(self, inputs: List[str], **kwargs) -> List[Response]:
             inputs, prompt_format=self.prompt_format)
 
         logger.info(inputs)
-        tokenized_inputs = self.tokenizer.encode(inputs[0])
+
+        tokenized_inputs = self.tokenizer.encode(inputs)
         kwargs = self._add_default_generate_kwargs(
             kwargs,
             model_inputs={"inputs": inputs,
                           "tokenized_inputs": tokenized_inputs},
         )
 
+        chat_completion = False
+        try:
+            inputs_bak = inputs
+            inputs = [json.loads(prompt) for prompt in inputs]
+            chat_completion = True
+        except:
+            logger.info("Seems no chat template from user")
+            inputs = inputs_bak
+
         logger.info(f"Forward params: {kwargs}, model_inputs {inputs}")
         responses = []
         for input in inputs:
             st = time.monotonic()
-            output = self.model(input, **kwargs)
+            if chat_completion:
+                kwargs.pop('stopping_criteria', None)
+                kwargs.pop('echo', None)
+                logger.info(f"Forward params: {kwargs}, model_inputs {inputs}")
+                output = self.model.create_chat_completion(
+                    messages=input,
+                    **kwargs
+                )
+                text = output["choices"][0]["message"]["content"].replace("\u200b", "").strip()
+            else:
+                output = self.model(input, **kwargs)
+                text = output["choices"][0]["text"].replace("\u200b", "").strip()
+
+
+            logger.info(f"llm's raw response is: {output}")
             gen_time = time.monotonic() - st
-            text = output["choices"][0]["text"].replace("\u200b", "").strip()
+
             responses.append(
                 Response(
                     generated_text=text,
@@ -178,6 +203,7 @@ def from_initializer(
         cls,
         initializer: "LlamaCppInitializer",
         model_id: str,
+        prompt_format: Optional[str] = None,
         device: Optional[Union[str, int, torch.device]] = None,
         **kwargs,
     ) -> "LlamaCppPipeline":
@@ -188,6 +214,7 @@
         return cls(
             model,
             tokenizer,
+            prompt_format,
             device=device,
             **kwargs,
         )
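The chat/text routing above is the core of this change. As a rough standalone sketch of the same idea (not the pipeline class itself; the model path is a placeholder), assuming a rendered prompt is either a JSON message list or plain text:

```python
# Rough sketch of the routing logic, outside the pipeline class.
import json
from llama_cpp import Llama

llm = Llama(model_path="/models/qwen1_5-7b-chat-q3_k_m.gguf", n_gpu_layers=0, verbose=False)

def generate(prompt: str, **kwargs) -> str:
    try:
        messages = json.loads(prompt)
        if not isinstance(messages, list):
            raise ValueError("not a chat message list")
    except ValueError:
        # Plain prompt: ordinary text completion
        out = llm(prompt, **kwargs)
        return out["choices"][0]["text"].strip()
    # JSON message list: use the chat-completion API so the model's chat template is applied
    out = llm.create_chat_completion(messages=messages, **kwargs)
    return out["choices"][0]["message"]["content"].strip()

print(generate("Hello!", max_tokens=32))
```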
Lines changed: 37 additions & 0 deletions (new file)
@@ -0,0 +1,37 @@
+deployment_config:
+  autoscaling_config:
+    min_replicas: 0
+    initial_replicas: 1
+    max_replicas: 8
+    target_num_ongoing_requests_per_replica: 1.0
+    metrics_interval_s: 10.0
+    look_back_period_s: 30.0
+    smoothing_factor: 1.0
+    downscale_delay_s: 300.0
+    upscale_delay_s: 90.0
+  ray_actor_options:
+    num_cpus: 0.1 # a model deployment creates 3 actors; the first two each cost 0.1 CPU, and model inference uses the CPUs set at the end of this file
+model_config:
+  warmup: True
+  model_task: text-generation
+  model_id: Qwen/Qwen1.5-7B-Chat-GGUF
+  max_input_words: 128
+  initialization:
+    initializer:
+      type: LlamaCpp
+      model_filename: qwen1_5-7b-chat-q3_k_m.gguf
+      model_init_kwargs:
+        test: true
+    pipeline: llamacpp
+  generation:
+    max_batch_size: 2
+    batch_wait_timeout_s: 0
+    generate_kwargs:
+      max_tokens: 32
+      echo: true
+    prompt_format: '[{{"role": "system", "content": "You are a friendly chatbot who always responds in the style of a pirate"}},{{"role": "user", "content": "{instruction}"}}]'
+    stopping_sequences: ["\n"]
+scaling_config:
+  num_workers: 1
+  num_gpus_per_worker: 0
+  num_cpus_per_worker: 8 # for inference
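Assuming the `prompt_format` string is rendered with Python `str.format`-style substitution (as the `{instruction}` placeholder and the doubled braces suggest), the Qwen chat template in this new config expands to valid JSON that the pipeline can parse into chat messages:

```python
# Sketch: rendering the chat-style prompt_format from the config above.
import json

prompt_format = (
    '[{{"role": "system", "content": "You are a friendly chatbot who always responds '
    'in the style of a pirate"}},'
    '{{"role": "user", "content": "{instruction}"}}]'
)

rendered = prompt_format.format(instruction="What is the GGUF format?")
messages = json.loads(rendered)
print(messages[1])  # {'role': 'user', 'content': 'What is the GGUF format?'}
```

Note that this round-trips through JSON, so an instruction containing double quotes would produce invalid JSON and the request would fall back to the plain text-completion path.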

models/text-generation--llama-7b-GGUF.yaml

Lines changed: 5 additions & 14 deletions
@@ -1,6 +1,6 @@
 deployment_config:
   autoscaling_config:
-    min_replicas: 0
+    min_replicas: 1
     initial_replicas: 1
     max_replicas: 8
     target_num_ongoing_requests_per_replica: 1.0
@@ -15,34 +15,25 @@ model_config:
   warmup: True
   model_task: text-generation
   model_id: TheBloke/Llama-2-7B-GGUF
-  max_input_words: 800
+  max_input_words: 128
   initialization:
-    # s3_mirror_config:
-    #   endpoint_url: http://39.107.108.170:9000
-    #   bucket_uri: /Users/hub/models/llama-2-7b-gguf/
    initializer:
       type: LlamaCpp
-      model_filename: llama-2-7b.Q5_K_S.gguf
+      model_filename: llama-2-7b.Q2_K.gguf
       model_init_kwargs:
         test: true
-
-      # use_kernel: true # for deepspped type only
-      # max_tokens: 1536 # for deepspped type only
-    # pipeline: defaulttransformers
-    # pipeline: default
     pipeline: llamacpp
   generation:
     max_batch_size: 2
     batch_wait_timeout_s: 0
     generate_kwargs:
-      # do_sample: true
       max_tokens: 128
       temperature: 0.7
       top_p: 0.8
       top_k: 50
       echo: false
-    # prompt_format: "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n### Instruction:\n{instruction}\n### Response:\n"
-    stopping_sequences: ["### Response:", "### End"]
+    prompt_format: "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n### Instruction:\n{instruction}\n### Response:\n"
+    #stopping_sequences: ["\n"]
 scaling_config:
   num_workers: 1
   num_gpus_per_worker: 0
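With the plain-text `prompt_format` now enabled for this model, the rendered prompt is not valid JSON, so it takes the ordinary text-completion path rather than `create_chat_completion`. A quick sketch of that fallback, under the same `str.format` assumption as above:

```python
# Sketch: the llama-7b prompt_format renders to plain text, so json.loads() fails
# and the pipeline falls back to text completion.
import json

prompt_format = (
    "Below is an instruction that describes a task. Write a response that appropriately "
    "completes the request.\n### Instruction:\n{instruction}\n### Response:\n"
)
rendered = prompt_format.format(instruction="Summarize GGUF in one sentence.")

try:
    json.loads(rendered)
    chat_completion = True
except ValueError:
    chat_completion = False

print(chat_completion)  # False -> the pipeline would call self.model(prompt, **kwargs)
```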

setup.py

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@
         "accelerate==0.25.0",
         "deepspeed==0.14.0",
         "torchmetrics==1.2.1",
-        "llama_cpp_python==0.2.20",
+        "llama_cpp_python==0.2.57",
         "transformers==4.39.1",
     ],
     "vllm": [
