 import queue
 import uvicorn
 from io import StringIO
-from util import format_prompt_llama3, format_prompt, format_prompt_tess
+from util import format_prompt_llama3, format_prompt, format_prompt_tess, format_prompt_commandr
 from util_merge import ExLlamaV2MergePassthrough

 def generate_unique_id():
     return uuid.uuid4()

 # This is a demo and small stress test to showcase some of the features of the dynamic batching generator.
-repo_str = 'tess-xl-exl2-speculative'
+repo_str = 'commandr-exl2'

 class CompletionRequest(BaseModel):
     model: str
@@ -205,7 +205,7 @@ def display(self):
 total_context = 32768

 # Max individual context
-max_context = 12288
+max_context = 8192

 # N-gram or draft model speculative decoding. Largely detrimental to performance at higher batch sizes.
 use_ngram = False
@@ -215,7 +215,7 @@ def display(self):
     draft_model_dir = specrepo_id

 # Max number of batches to run at once, assuming the sequences will fit within total_context.
-max_batch_size = 6 if paged else 1
+max_batch_size = 4 if paged else 1

 # Max chunk size. Determines the size of prefill operations. Can be reduced to reduce pauses whenever a
 # new job is started, but at the expense of overall prompt ingestion speed.
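As a side note on the resized limits above: a quick arithmetic check (mine, not part of the commit) suggests the new values were chosen so that a full batch of maximum-length jobs fits the flat cache exactly, whereas the old pair oversubscribed it and relied on jobs staying short of their limit.

# Quick check of the cache budget implied by the new settings (illustrative only).
total_context = 32768        # shared flat cache, unchanged by this commit
old_budget = 6 * 12288       # = 73728 tokens, more than total_context
new_budget = 4 * 8192        # = 32768 tokens, exactly total_context
assert new_budget <= total_context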
@@ -267,22 +267,22 @@ def display(self):
 config.max_input_len = max_chunk_size
 config.max_attention_size = max_chunk_size ** 2

-ropescale = 2.5
-config.scale_alpha_value = ropescale
+# ropescale = 2.5
+# config.scale_alpha_value = ropescale
 config.max_seq_len = max_context
 model = ExLlamaV2(config)

 # Configure the cache. The dynamic generator expects a batch size of 1 and a max_seq_len equal to
 # the total number of cached tokens. The flat cache will be split dynamically

-# cache = ExLlamaV2Cache(
-#     model,
-#     max_seq_len = total_context,
-#     lazy = True
-# )
+cache = ExLlamaV2Cache_Q4(
+    model,
+    max_seq_len = total_context,
+    lazy = True
+)

-# model.load_autosplit(cache, progress = True)
-model.load([16,18,18,20])
+model.load_autosplit(cache, progress = True)
+# model.load([16,18,18,20])
 # Also, tokenizer

 print("Loading tokenizer...")
@@ -296,11 +296,11 @@ def display(self):
 #lora = ExLlamaV2Lora.from_directory(model, lora_directory)
 lora = None

-cache = ExLlamaV2Cache_Q4(
-    model,
-    max_seq_len = total_context,
+# cache = ExLlamaV2Cache_Q4(
+#     model,
+#     max_seq_len = total_context,
     #lazy = True
-)
+# )

 # Initialize the generator

@@ -574,6 +574,8 @@ async def mainchat(request: ChatCompletionRequest):
         prompt = await format_prompt_tess(request.messages)
     elif repo_str == 'tinyllama-exl2-speculative':
         prompt = await format_prompt_zephyr(request.messages)
+    elif repo_str == 'commandr-exl2' or repo_str == 'commandr-exl2-speculative':
+        prompt = await format_prompt_commandr(request.messages)
     else:
         prompt = await format_prompt(request.messages)
     status_area.update(f"Prompt: {prompt}")
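Since the new 'commandr-exl2' branch calls format_prompt_commandr, which lives in util.py and is not part of this diff, here is a rough sketch of what such a helper commonly looks like for Command R's chat format. The turn tokens and the dict-style message access below are assumptions, not code from this repository:

# Hypothetical sketch only; the real util.format_prompt_commandr may differ.
# Assumes messages are dicts with "role" and "content" keys (the script's Pydantic
# message objects would use attribute access instead).
async def format_prompt_commandr(messages):
    role_tokens = {
        "system": "<|SYSTEM_TOKEN|>",
        "user": "<|USER_TOKEN|>",
        "assistant": "<|CHATBOT_TOKEN|>",
    }
    prompt = "<BOS_TOKEN>"
    for m in messages:
        token = role_tokens.get(m["role"], "<|USER_TOKEN|>")
        prompt += f"<|START_OF_TURN_TOKEN|>{token}{m['content']}<|END_OF_TURN_TOKEN|>"
    # Leave an open assistant turn so generation continues from here.
    prompt += "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
    return prompt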