KV cache creation as a separate func (#53)

MasterJH5574 · web-flow · commit cb76f4119eec · 2023-04-21T22:52:04.000-04:00
This PR moves KV cache creation into a dedicated VM function
(`create_kv_cache`) that is invoked before generation. It also removes
the `encoding_without_cache` function.
diff --git a/build.py b/build.py
@@ -81,8 +81,8 @@ def get_models(config, model):
     if "vicuna" in model or "llama" in model:
         bb = relax.BlockBuilder()
         llama.create_encoding_func(bb, config)
-        llama.create_encoding_func_without_cache(bb, config)
         llama.create_decoding_func(bb, config)
+        llama.create_kv_cache_func(bb, config)
         mod = bb.get()
 
         for gv in mod.functions:
@@ -121,7 +121,7 @@ def mod_transform_before_build(
     mod: tvm.IRModule, model_params: List[tvm.nd.NDArray], args: Dict
 ) -> tvm.IRModule:
     """First-stage: Legalize ops and trace"""
-    model_names = ["encoding", "decoding", "encoding_without_cache"]
+    model_names = ["encoding", "decoding", "create_kv_cache"]
 
     mod = web_llm.transform.GroupQuantize(group_size=32, sym=False)(mod)
     mod = web_llm.transform.FuseTransposeMatmul()(mod)
diff --git a/chat.py b/chat.py
@@ -145,21 +145,10 @@ def get_tvm_model(args):
     vm = relax.VirtualMachine(ex, device)
 
     class Model:
-        def new_cache(self):
-            fcreate_cache = tvm.get_global_func("vm.builtin.attention_kv_cache_create")
-            self.kv_cache = []
-            for i in range(64):  # num_layer
-                kv_cache = fcreate_cache(
-                    tvm.nd.empty((1, 32, 128), device=device, dtype="float32"),
-                    tvm.runtime.ShapeTuple([32, 32, 128]),
-                    0
-                )
-                self.kv_cache.append(kv_cache)
 
         def __init__(self) -> None:
-            self.kv_cache = None
             self.tot_seq_len = 0
-            self.new_cache()
+            self.kv_cache = vm["create_kv_cache"]()
 
         def forward(
             self, inputs: torch.Tensor
diff --git a/web/llm_chat.js b/web/llm_chat.js
@@ -145,30 +145,16 @@ class LLMChatPipeline {
     this.decoding = this.tvm.detachFromCurrentScope(
       this.vm.getFunction("decoding")
     );
-    this.encodingWithoutCache = this.tvm.detachFromCurrentScope(
-      this.vm.getFunction("encoding_without_cache")
-    );
     this.params = this.tvm.detachFromCurrentScope(
       this.tvm.getParamsFromCache("param", cacheMetadata.ParamSize)
     );
-    const fcreateCache = this.tvm.getGlobalFunc("vm.builtin.attention_kv_cache_create");
+    const fcreateCache = this.vm.getFunction("create_kv_cache");
     this.fclearKVCaches = this.tvm.detachFromCurrentScope(
       this.tvm.getGlobalFunc("vm.builtin.attention_kv_cache_array_clear")
     );
 
     // use extern config for now
-    // move to kv generation vm function
-    const kvList = [];
-    const kvConfig = config.kvConfig;
-    for (let i = 0; i < kvConfig.numLayers; ++i) {
-      const item = fcreateCache(
-        this.tvm.empty(kvConfig.shape, kvConfig.dtype, this.device),
-        this.tvm.makeShapeTuple(kvConfig.shape),
-        this.tvm.scalar(0, "int")
-      );
-      kvList.push(item);
-    }
-    this.kvCache = this.tvm.detachFromCurrentScope(this.tvm.makeTVMArray(kvList));
+    this.kvCache = this.tvm.detachFromCurrentScope(fcreateCache());
     // fill with pad token
     this.logitsOnCPU = undefined;
 
@@ -180,7 +166,6 @@ class LLMChatPipeline {
   dispose() {
     // note: tvm instance is not owned by this class
     this.params.dispose();
-    this.encodingWithoutCache.dispose();
     this.decoding.dispose();
     this.encoding.dispose();
     this.vm.dispose();
@@ -368,7 +353,7 @@ class LLMChatPipeline {
   }
 
   async evaluate() {
-    // run a canonicla evaluateion fo the flow
+    // run a canonical evaluation of the flow
     this.#clearKVCache();
     const testPrompt = "The capital of Canada is";
     const ids = await this.tokenizer.encodeIds(testPrompt);
diff --git a/web_llm/relax_model/llama.py b/web_llm/relax_model/llama.py
@@ -178,7 +178,7 @@ def forward(
         cos_cached: relax.Expr,
         sin_cached: relax.Expr,
         all_seq_len_shape: relax.Expr,
-        past_key_value: Optional[Tuple[relax.Expr]] = None,
+        past_key_value: Tuple[relax.Expr],
         attention_mask: Optional[relax.Expr] = None,
     ) -> Tuple[relax.Expr, Optional[relax.Expr], Optional[Tuple[relax.Expr]]]:
         from tvm.relax.op import astype, matmul, maximum, permute_dims, reshape, squeeze
@@ -221,43 +221,43 @@ def forward(
             [kv_states_shape[0], kv_seq_len, kv_states_shape[2], kv_states_shape[3]]
         )
         kv_cache_shape = R.shape([kv_seq_len, kv_states_shape[2], kv_states_shape[3]])
-        if past_key_value is not None:
-            squeezed_key = nn.emit(squeeze(key_states, axis=0))
-            squeezed_value = nn.emit(squeeze(value_states, axis=0))
-            k_cache, v_cache = past_key_value
-            f_kv_cache_append = relax.extern("vm.builtin.attention_kv_cache_append")
-            k_cache = nn.emit(
-                relax.Call(
-                    f_kv_cache_append,
-                    args=[k_cache, squeezed_key],
-                    sinfo_args=[relax.ObjectStructInfo()],
-                )
+
+        squeezed_key = nn.emit(squeeze(key_states, axis=0))
+        squeezed_value = nn.emit(squeeze(value_states, axis=0))
+        k_cache, v_cache = past_key_value
+        f_kv_cache_append = relax.extern("vm.builtin.attention_kv_cache_append")
+        k_cache = nn.emit(
+            relax.Call(
+                f_kv_cache_append,
+                args=[k_cache, squeezed_key],
+                sinfo_args=[relax.ObjectStructInfo()],
             )
-            v_cache = nn.emit(
-                relax.Call(
-                    f_kv_cache_append,
-                    args=[v_cache, squeezed_value],
-                    sinfo_args=[relax.ObjectStructInfo()],
-                )
+        )
+        v_cache = nn.emit(
+            relax.Call(
+                f_kv_cache_append,
+                args=[v_cache, squeezed_value],
+                sinfo_args=[relax.ObjectStructInfo()],
             )
-            past_key_value = (k_cache, v_cache)
-            f_kv_cache_view = relax.extern("vm.builtin.attention_kv_cache_view")
-            k_cache = nn.emit(
-                relax.Call(
-                    f_kv_cache_view,
-                    args=[k_cache, kv_cache_shape],
-                    sinfo_args=[R.Tensor(kv_cache_shape, kv_states_dtype)],
-                )
+        )
+        past_key_value = (k_cache, v_cache)
+        f_kv_cache_view = relax.extern("vm.builtin.attention_kv_cache_view")
+        k_cache = nn.emit(
+            relax.Call(
+                f_kv_cache_view,
+                args=[k_cache, kv_cache_shape],
+                sinfo_args=[R.Tensor(kv_cache_shape, kv_states_dtype)],
             )
-            v_cache = nn.emit(
-                relax.Call(
-                    f_kv_cache_view,
-                    args=[v_cache, kv_cache_shape],
-                    sinfo_args=[R.Tensor(kv_cache_shape, kv_states_dtype)],
-                )
+        )
+        v_cache = nn.emit(
+            relax.Call(
+                f_kv_cache_view,
+                args=[v_cache, kv_cache_shape],
+                sinfo_args=[R.Tensor(kv_cache_shape, kv_states_dtype)],
             )
-            key_states = nn.emit(reshape(k_cache, kv_states_shape))
-            value_states = nn.emit(reshape(v_cache, kv_states_shape))
+        )
+        key_states = nn.emit(reshape(k_cache, kv_states_shape))
+        value_states = nn.emit(reshape(v_cache, kv_states_shape))
 
         query_states = nn.emit(permute_dims(query_states, [0, 2, 1, 3]))
         key_states = nn.emit(permute_dims(key_states, [0, 2, 1, 3]))
@@ -333,8 +333,8 @@ def forward(
         cos_cached: relax.Expr,
         sin_cached: relax.Expr,
         all_seq_len_shape: relax.Expr,
+        past_key_value: Tuple[relax.Expr],
         attention_mask: Optional[relax.Expr] = None,
-        past_key_value: Optional[Tuple[relax.Expr]] = None,
     ) -> Tuple[relax.Expr, Optional[Tuple[relax.Expr, relax.Expr]]]:
         residual = hidden_states
 
@@ -402,7 +402,7 @@ def forward(
         cos_cached: relax.Expr,
         sin_cached: relax.Expr,
         all_seq_len_shape: relax.Expr,
-        past_key_values: Optional[relax.Expr] = None,
+        past_key_values: relax.Expr,
     ):
         # retrieve input_ids
         batch_size, seq_length = input_ids.struct_info.shape
@@ -421,11 +421,8 @@ def forward(
         next_decoder_cache = ()
 
         for idx, decoder_layer in enumerate(self.layers):
-            past_key_value = (
-                (past_key_values[idx * 2], past_key_values[idx * 2 + 1])
-                if past_key_values is not None
-                else None
-            )
+            assert past_key_values is not None
+            past_key_value = (past_key_values[idx * 2], past_key_values[idx * 2 + 1])
 
             hidden_states, key_value_cache = decoder_layer(
                 hidden_states,
@@ -459,7 +456,7 @@ def forward(
         self,
         input_ids: relax.Expr,
         all_seq_len_shape: relax.Expr,
-        past_key_values: Optional[List[relax.Expr]] = None,
+        past_key_values: relax.Expr,
     ):
         hidden_states, key_value_cache = self.model(
             input_ids=input_ids,
@@ -543,20 +540,24 @@ def create_decoding_func(bb: relax.BlockBuilder, config: LlamaConfig) -> None:
     bb.update_func(gv, mod[gv].with_attr("num_input", 3))
 
 
-def create_encoding_func_without_cache(bb: relax.BlockBuilder, config: LlamaConfig) -> None:
-    bsz = 1
-    seq_len = tvm.tir.Var("n", "int64")
-
-    with bb.function("encoding_without_cache"):
-        model = LlamaForCausalLM(config)
-        input_ids = nn.Placeholder((bsz, seq_len), dtype="int32", name="input_ids")
-        all_seq_len_shape = relax.Var("all_seq_len", relax.ShapeStructInfo((seq_len,)))
+def create_kv_cache_func(bb: relax.BlockBuilder, config: LlamaConfig) -> None:
+    init_shape = relax.ShapeExpr(
+        (1, config.num_attention_heads, config.hidden_size // config.num_attention_heads)
+    )
+    with bb.function("create_kv_cache", []):
         with bb.dataflow():
-            logits, _ = model(input_ids, all_seq_len_shape)
-            params = [input_ids, all_seq_len_shape] + model.parameters()
-            gv = bb.emit_output(logits)
-        bb.emit_func_output(gv, params)
-
-    mod = bb.get()
-    gv = mod.get_global_var("encoding_without_cache")
-    bb.update_func(gv, mod[gv].with_attr("num_input", 2))
+            zeros = bb.emit(relax.op.zeros(init_shape, "float32"))
+            caches = []
+            f_kv_cache_create = relax.extern("vm.builtin.attention_kv_cache_create")
+            for _ in range(config.num_hidden_layers * 2):
+                caches.append(
+                    bb.emit(
+                        relax.Call(
+                            f_kv_cache_create,
+                            args=[zeros, init_shape, relax.PrimValue(0)],
+                            sinfo_args=[relax.ObjectStructInfo()],
+                        )
+                    )
+                )
+            gv = bb.emit_output(caches)
+        bb.emit_func_output(gv)