Skip to content

Commit 35c42f2

Browse files
authored
Enable KV cache reuse across conversations (#51)
Currently, when a user enters an input, the behavior is to clear the KV cache and collect all previous conversation history as the input prompt to the model. This becomes extremely slow when the history is large. Clearly, there is redundant KV computation for previous conversations: we can reuse the KV cache from the last round of conversation to avoid recomputing it. This PR enables that optimization.
1 parent 3d62bc9 commit 35c42f2

File tree

8 files changed

+501
-56
lines changed

8 files changed

+501
-56
lines changed

build.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ def get_models(config, model):
8888
for gv in mod.functions:
8989
func = mod[gv]
9090
if isinstance(func, relax.Function):
91-
mod[gv] = func.with_attr("tir_var_upper_bound", {"n": config.max_sequence_length})
91+
mod[gv] = func.with_attr("tir_var_upper_bound", {"n": config.max_sequence_length, "m": config.max_sequence_length})
9292

9393
return mod
9494
else:

chat.py

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ def _parse_args():
1717
args.add_argument("--debug-dump", action="store_true", default=False)
1818
args.add_argument("--artifact-path", type=str, default="dist")
1919
args.add_argument("--model", type=str, default="vicuna-7b-v1")
20-
args.add_argument("--max-gen-len", type=int, default=128)
20+
args.add_argument("--max-gen-len", type=int, default=2048)
2121
args.add_argument("--run-torch-model", action="store_true", default=False)
2222
parsed = args.parse_args()
2323
parsed.model_path = os.path.join(parsed.artifact_path, "models", parsed.model)
@@ -46,9 +46,11 @@ def generate(
4646
top_p: float = 0.95,
4747
stream_interval: int = 2,
4848
stop_str: str = None,
49+
add_bos = True,
4950
):
5051
prompt_tokens = self.tokenizer.encode(prompt)
51-
52+
if not add_bos:
53+
prompt_tokens = prompt_tokens[1:]
5254
total_len = max_gen_len + len(prompt_tokens)
5355
tokens = torch.full((1, total_len), self.tokenizer.pad_token_id).to(
5456
torch.int32
@@ -57,9 +59,9 @@ def generate(
5759
start_pos = len(prompt_tokens)
5860
for cur_pos in range(start_pos, total_len):
5961
if cur_pos == start_pos:
60-
logits = self.model(tokens[:, :cur_pos], cur_pos, clear_cache=True)
62+
logits = self.model(tokens[:, :cur_pos])
6163
else:
62-
logits = self.model(tokens[:, cur_pos - 1 : cur_pos], cur_pos)
64+
logits = self.model(tokens[:, cur_pos - 1 : cur_pos])
6365
logits = logits[:, -1, :]
6466
if temperature > 0:
6567
probs = torch.softmax(logits / temperature, dim=-1)
@@ -102,6 +104,7 @@ def chat(model_wrapper, args):
102104

103105
# Chat
104106
conv = conv_templates["vicuna_v1.1"].copy()
107+
add_bos = True
105108
while True:
106109
try:
107110
inp = input(f"{conv.roles[0]}: ")
@@ -113,14 +116,14 @@ def chat(model_wrapper, args):
113116

114117
conv.append_message(conv.roles[0], inp)
115118
conv.append_message(conv.roles[1], None)
116-
prompt = conv.get_prompt()
117-
119+
prompt = conv.get_prompt_unprocessed()
118120
print(f"{conv.roles[1]}: ", end="", flush=True)
119121
pre = 0
120122
for outputs in model_wrapper.generate(
121123
prompt,
122124
args.max_gen_len,
123125
stop_str=conv.sep if conv.sep_style == SeparatorStyle.SINGLE else conv.sep2,
126+
add_bos = add_bos,
124127
):
125128
outputs = outputs[len(prompt) + 1 :].strip()
126129
outputs = outputs.split(" ")
@@ -131,6 +134,7 @@ def chat(model_wrapper, args):
131134
print(" ".join(outputs[pre:]), flush=True)
132135

133136
conv.messages[-1][-1] = " ".join(outputs)
137+
add_bos = False
134138
print("\n", {"prompt": prompt, "outputs": outputs}, "\n")
135139

136140

@@ -154,15 +158,15 @@ def new_cache(self):
154158

155159
def __init__(self) -> None:
156160
self.kv_cache = None
161+
self.tot_seq_len = 0
157162
self.new_cache()
158163

159164
def forward(
160-
self, inputs: torch.Tensor, cur_pos: int, clear_cache: bool = False
165+
self, inputs: torch.Tensor
161166
) -> torch.Tensor:
162-
if clear_cache:
163-
self.new_cache()
164167
inputs = tvm.nd.array(inputs.numpy(), device=device)
165-
seq_len_shape = tvm.runtime.ShapeTuple([cur_pos])
168+
self.tot_seq_len+=inputs.shape[1]
169+
seq_len_shape = tvm.runtime.ShapeTuple([self.tot_seq_len])
166170
if inputs.shape[1] > 1:
167171
logits, kv_cache = vm["encoding"](
168172
inputs, seq_len_shape, self.kv_cache, const_params

web/gh-page-config.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
"dtype": "float32"
66
},
77
"wasmUrl": "dist/vicuna-7b-v1/vicuna-7b-v1_webgpu.wasm",
8-
"cacheUrl": "https://huggingface.co/mlc-ai/web-lm/resolve/main/vicuna-0b/",
8+
"cacheUrl": "https://huggingface.co/mlc-ai/web-lm/resolve/main/vicuna-7b-v1/",
99
"tokenizer": "dist/vicuna-7b-v1/tokenizer.model",
1010
"maxGenLength": 512,
1111
"meanGenLength": 128,

web/llm_chat.js

Lines changed: 112 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,57 @@ class Conversation {
3636
return ret;
3737
}
3838

39+
/**
40+
* Get prompt arrays that has not been fed as input
41+
*
42+
* @returns The prompt array.
43+
*/
44+
getPromptArrayUnproccessed() {
45+
if (this.seps.length == 0) {
46+
throw Error("Need seps to work")
47+
}
48+
if (this.messages.length < 3) {
49+
throw Error("needs to call getLastPromptArray for the first message");
50+
}
51+
let ret = [this.seps[this.seps.length - 1]];
52+
for (let i = this.messages.length - 2; i < this.messages.length; ++i) {
53+
const item = this.messages[i];
54+
const role = item[0];
55+
const message = item[1];
56+
if (message !== undefined && message != "") {
57+
ret.push(role + ": " + message + this.seps[i % this.seps.length]);
58+
} else {
59+
ret.push(role + ":");
60+
}
61+
}
62+
return ret;
63+
64+
}
65+
66+
/**
67+
* Get last prompt array with prefix as system.
68+
*
69+
* @returns The prompt array.
70+
*/
71+
getLastPromptArray() {
72+
if (this.seps.length == 0) {
73+
throw Error("Need seps to work")
74+
}
75+
let ret = [this.system + this.seps[0]];
76+
77+
for (let i = this.messages.length - 2; i < this.messages.length; ++i) {
78+
const item = this.messages[i];
79+
const role = item[0];
80+
const message = item[1];
81+
if (message !== undefined && message != "") {
82+
ret.push(role + ": " + message + this.seps[i % this.seps.length]);
83+
} else {
84+
ret.push(role + ":");
85+
}
86+
}
87+
return ret;
88+
}
89+
3990
reset() {
4091
this.messages = [];
4192
}
@@ -52,12 +103,12 @@ class Conversation {
52103
function defaultConversation(maxWindowLength = 512) {
53104
return new Conversation({
54105
system: "A chat between a curious user and an artificial intelligence assistant. " +
55-
"The assistant gives helpful, detailed, and polite answers to the user's questions.",
106+
"The assistant gives helpful, detailed, and polite answers to the user's questions.",
56107
roles: ["USER", "ASSISTANT"],
57108
maxWindowLength: maxWindowLength,
58109
messages: [],
59110
offset: 0,
60-
seps:[" ", "</s>"],
111+
seps: [" ", "</s>"],
61112
});
62113
};
63114

@@ -120,6 +171,9 @@ class LLMChatPipeline {
120171
this.kvCache = this.tvm.detachFromCurrentScope(this.tvm.makeTVMArray(kvList));
121172
// fill with pad token
122173
this.logitsOnCPU = undefined;
174+
175+
this.kvCacheLength = 0;
176+
this.clearCache = true
123177
}
124178

125179

@@ -167,7 +221,7 @@ class LLMChatPipeline {
167221
this.tvm.empty(logits.shape, logits.dtype, this.tvm.cpu())
168222
);
169223
} else {
170-
if(logits.shape[0] != this.logitsOnCPU.shape[0]) {
224+
if (logits.shape[0] != this.logitsOnCPU.shape[0]) {
171225
throw Error("We expect the size of logits to remain unchanged");
172226
}
173227
}
@@ -183,35 +237,56 @@ class LLMChatPipeline {
183237
}
184238

185239
async getInputTokens() {
186-
const tokens = [this.bosTokenId];
187-
const prompts = this.conversation.getPromptArray();
240+
let tokens = [this.bosTokenId];
241+
let prompts = ""
242+
if (this.conversation.messages.length <= 2) {
243+
prompts = this.conversation.getPromptArray();
244+
} else {
245+
tokens.pop();
246+
prompts = this.conversation.getPromptArrayUnproccessed();
247+
}
188248
tokens.push(...await this.tokenizer.encodeIds(prompts[0]));
189-
190249
let ctxLength = tokens.length;
191-
const context = [];
250+
let context = [];
251+
let need_shift_window = false;
192252
for (let i = prompts.length - 1; i > 0; --i) {
193253
const encoded = this.tokenizer.encodeIds(prompts[i]);
194254
ctxLength += encoded.length;
195-
if (ctxLength + this.meanGenLength >= this.maxWindowLength && i + 2 < prompts.length) {
196-
this.logger("Shift window at " + i);
255+
if (this.kvCacheLength + ctxLength + this.meanGenLength >= this.maxWindowLength) {
256+
need_shift_window = true;
197257
break;
198258
}
199259
context.unshift(encoded);
200260
}
201-
const followMessage = [];
202-
for (const ctx of context) {
203-
followMessage.push(...ctx);
261+
if (!need_shift_window) {
262+
for (const ctx of context) {
263+
tokens.push(...ctx);
264+
}
265+
return tokens;
204266
}
205-
206-
if (followMessage.length + tokens.length + this.meanGenLength >= this.maxWindowLength) {
207-
const maxMsgLen = this.maxWindowLength - tokens.length - this.meanGenLength;
208-
if (maxMsgLen < this.meanGenLength) {
209-
throw Error("Too small window config tokens.length=" + tokens.length);
267+
// need shift window and re-encode
268+
this.logger("need shift window")
269+
this.kvCacheLength = 0;
270+
this.clearCache = true;
271+
// abandon all tokens we collected
272+
tokens = [this.bosTokenId]
273+
let all_prompts = this.conversation.getPromptArray();
274+
tokens.push(...await this.tokenizer.encodeIds(all_prompts[0]));
275+
context = [];
276+
ctxLength = tokens.length;
277+
//only keep 10% of the window context
278+
const fill_factor = 0.1
279+
for (let i = all_prompts.length - 1; i > 0; --i) {
280+
const encoded = this.tokenizer.encodeIds(all_prompts[i]);
281+
ctxLength += encoded.length;
282+
if (ctxLength >= fill_factor * this.maxWindowLength && i + 2 < all_prompts.length) {
283+
break;
210284
}
211-
this.logger("Slice message " + followMessage.length + " to " + maxMsgLen);
212-
followMessage = followMessage.slice(followMessage.length - maxMsgLen);
285+
context.unshift(encoded);
286+
}
287+
for (const ctx of context) {
288+
tokens.push(...ctx);
213289
}
214-
tokens.push(...followMessage);
215290
if (tokens.length + this.meanGenLength >= this.maxWindowLength) {
216291
throw Error("Exceed max window length curr=" + tokens.length);
217292
}
@@ -235,16 +310,18 @@ class LLMChatPipeline {
235310
const inputTokenLength = tokens.length;
236311

237312
var outputPrompt = "";
238-
this.#clearKVCache();
313+
if (this.clearCache) {
314+
this.#clearKVCache();
315+
this.clearCache = false;
316+
}
239317
const maxGenLen = Math.min(this.maxGenLength, this.maxWindowLength - tokens.length);
240318
if (maxGenLen < this.meanGenLength) {
241319
throw Error("Too small window size config");
242320
}
243-
244-
for (let step = 0; step < maxGenLen; ++step) {
321+
let step = 0;
322+
for (; step < maxGenLen && this.kvCacheLength + inputTokenLength + step < this.maxWindowLength; ++step) {
245323
this.tvm.beginScope();
246324
var inputData;
247-
248325
let tstart = performance.now();
249326
if (step == 0) {
250327
inputData = this.tvm.empty([1, tokens.length], "int32", this.device);
@@ -254,7 +331,7 @@ class LLMChatPipeline {
254331
inputData.copyFrom(tokens.slice(tokens.length - 1));
255332
}
256333
const logits = this.tvm.detachFromCurrentScope(
257-
this.#forward(inputData, inputTokenLength + step)
334+
this.#forward(inputData, this.kvCacheLength + inputTokenLength + step)
258335
);
259336
this.tvm.endScope();
260337

@@ -285,6 +362,7 @@ class LLMChatPipeline {
285362
callbackUpdateResponse(step, outputPrompt);
286363
}
287364
}
365+
this.kvCacheLength += tokens.length - 1;
288366
this.conversation.messages[this.conversation.messages.length - 1][1] = outputPrompt;
289367
return outputPrompt;
290368
}
@@ -358,12 +436,12 @@ class LLMChatInstance {
358436
this.logger = console.log;
359437
this.debugTest = false;
360438
}
361-
/**
362-
* Initialize TVM
363-
* @param wasmUrl URL to wasm source.
364-
* @param cacheUrl URL to NDArray cache.
365-
* @param logger Custom logger.
366-
*/
439+
/**
440+
* Initialize TVM
441+
* @param wasmUrl URL to wasm source.
442+
* @param cacheUrl URL to NDArray cache.
443+
* @param logger Custom logger.
444+
*/
367445
async #asyncInitTVM(wasmUrl, cacheUrl) {
368446
if (this.tvm !== undefined) {
369447
return;
@@ -395,7 +473,7 @@ class LLMChatInstance {
395473
this.reset();
396474
throw Error("This browser env do not support WebGPU");
397475
}
398-
} catch(err) {
476+
} catch (err) {
399477
this.appendMessage("error", "Find an error initializing the WebGPU device " + err.toString());
400478
console.log(err.stack);
401479
this.reset();
@@ -444,7 +522,7 @@ class LLMChatInstance {
444522
// initialize UX and tokenizer
445523
const tokenizer = await tvmjsGlobalEnv.sentencePieceProcessor(this.config.tokenizer);
446524
this.pipeline = this.tvm.withNewScope(() => {
447-
return new LLMChatPipeline(this.tvm, tokenizer, this.tvm.cacheMetadata, this.config);
525+
return new LLMChatPipeline(this.tvm, tokenizer, this.tvm.cacheMetadata, this.config);
448526
});
449527
await this.pipeline.asyncLoadWebGPUPiplines();
450528
this.updateLastMessage("init", "All initialization finished.");
@@ -521,7 +599,7 @@ class LLMChatInstance {
521599

522600
try {
523601
await this.asyncInit();
524-
} catch(err) {
602+
} catch (err) {
525603
this.appendMessage("error", "Init error, " + err.toString());
526604
console.log(err.stack);
527605
this.reset();

web/local-config.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
"wasmUrl": "dist/vicuna-7b-v1/vicuna-7b-v1_webgpu.wasm",
88
"cacheUrl": "vicuna-7b-v1-params/",
99
"tokenizer": "dist/vicuna-7b-v1/tokenizer.model",
10-
"maxGenLength": 512,
11-
"meanGenLength": 128,
12-
"maxWindowLength": 1024
10+
"maxGenLength": 1024,
11+
"meanGenLength": 256,
12+
"maxWindowLength": 2048
1313
}

0 commit comments

Comments
 (0)