@@ -69,7 +69,7 @@ class Conversation {
       throw Error("needs to call getPromptArray for the first message");
     }
     if (this.separator_style == "Two") {
-      let ret = [this.seps[this.seps.length - 1]];
+      let ret = [];
       for (let i = this.messages.length - 2; i < this.messages.length; ++i) {
         const item = this.messages[i];
         const role = item[0];
@@ -167,6 +167,8 @@ class LLMChatPipeline {
 
     this.temperature = config.temperature;
     this.top_p = config.top_p;
+    this.repetitionPenalty = config.repetition_penalty
+    this.appeared_tokens = new Set();
 
     this.meanGenLength = config.mean_gen_len;
     this.streamInterval = 1;
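The two new fields sit next to the sampling parameters already read from the model config. A minimal, hypothetical config fragment for orientation; only temperature, top_p, repetition_penalty, and mean_gen_len are confirmed by the lines above, and the values are purely illustrative:

const config = {
  temperature: 0.7,          // illustrative value
  top_p: 0.95,               // illustrative value
  repetition_penalty: 1.0,   // values <= 1.0 keep the plain top-p path (see the sampling hunk below)
  mean_gen_len: 128,         // illustrative value
};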
@@ -268,7 +270,16 @@ class LLMChatPipeline {
     this.#updateLogitsOnCPU(logits);
     this.tvm.endScope();
     await this.device.sync();
-    return this.tvm.sampleTopPFromLogits(this.logitsOnCPU, temperature, top_p);
+    if (this.repetitionPenalty < 1.0 + 1e-6) {
+      return this.tvm.sampleTopPFromLogits(this.logitsOnCPU, temperature, top_p);
+    } else {
+      this.tvm.beginScope();
+      var appeared_tokens_ndarray = this.tvm.empty([1, this.appeared_tokens.size], "int32", this.tvm.cpu());
+      appeared_tokens_ndarray.copyFrom(Array.from(this.appeared_tokens));
+      this.tvm.applyRepetitionPenalty(this.logitsOnCPU, appeared_tokens_ndarray, this.repetitionPenalty);
+      this.tvm.endScope();
+      return this.tvm.sampleTopPFromLogits(this.logitsOnCPU, temperature, top_p);
+    }
   }
 
   async getInputTokens() {
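The penalty itself is applied on the runtime side by tvm.applyRepetitionPenalty. As a reference point, here is a plain JavaScript sketch of the usual CTRL-style rule, assuming that is what the runtime call implements; the helper below is hypothetical and is not part of the commit:

// Sketch of a CTRL-style repetition penalty over a plain Float32Array of logits.
function applyRepetitionPenaltySketch(logits, appearedTokens, penalty) {
  for (const token of appearedTokens) {
    // Make already-generated tokens less likely: shrink a positive logit,
    // push a negative logit further down.
    if (logits[token] > 0) {
      logits[token] /= penalty;
    } else {
      logits[token] *= penalty;
    }
  }
}

// Example: with penalty 1.2, previously seen tokens 0 and 1 are dampened.
const exampleLogits = new Float32Array([2.0, -1.0, 0.5]);
applyRepetitionPenaltySketch(exampleLogits, new Set([0, 1]), 1.2);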
@@ -360,6 +371,7 @@ class LLMChatPipeline {
       throw Error("Too small window size config");
     }
     let step = 0;
+    var stop = false;
     for (; step < maxGenLen && this.kvCacheLength + inputTokenLength + step < this.maxWindowLength; ++step) {
       this.tvm.beginScope();
       var inputData;
@@ -375,22 +387,26 @@ class LLMChatPipeline {
         this.#forward(inputData, this.kvCacheLength + inputTokenLength + step)
       );
       this.tvm.endScope();
+      if (stop) {
+        break;
+      }
 
       const nextToken = await this.sampleTokenFromLogits(logits, this.temperature, this.top_p);
       logits.dispose();
 
       tokens.push(nextToken);
+      this.appeared_tokens.add(nextToken);
       const outputTokens = tokens.slice(inputTokenLength);
       outputPrompt = this.tokenizer.decode(outputTokens);
 
       if (this.stopTokens.includes(nextToken)) {
-        break;
+        stop = true;
       }
 
       const stopPos = outputPrompt.lastIndexOf(stopStr);
       if (stopPos != -1) {
         outputPrompt = outputPrompt.substring(0, stopPos);
-        break;
+        stop = true;
       }
 
       let tend = performance.now();
       if (step != 0) {
@@ -405,7 +421,7 @@ class LLMChatPipeline {
         callbackUpdateResponse(step, outputPrompt);
       }
     }
-    this.kvCacheLength += tokens.length - 1;
+    this.kvCacheLength += tokens.length;
    this.conversation.messages[this.conversation.messages.length - 1][1] = outputPrompt;
     return outputPrompt;
   }
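The last three hunks work together: instead of breaking out as soon as a stop condition is hit, the loop records it in stop and only exits at the top of the next iteration, after the freshly sampled token has been fed through #forward. Every generated token therefore lands in the KV cache, which is why kvCacheLength now advances by tokens.length rather than tokens.length - 1. A stripped-down sketch of that deferred-stop shape, with hypothetical helper names rather than the pipeline's own:

function generateSketch(promptTokens, maxGenLen, forward, sample, isStopToken) {
  const tokens = [...promptTokens];
  let stop = false;
  for (let step = 0; step < maxGenLen; ++step) {
    // First step forwards the whole prompt; later steps forward the last sampled token.
    forward(step == 0 ? promptTokens : [tokens[tokens.length - 1]]);
    if (stop) break;               // exit only after the stop token has been forwarded
    const next = sample();
    tokens.push(next);
    if (isStopToken(next)) stop = true;
  }
  return tokens;                   // every sampled token, including the stop token, was forwarded
}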