Commit 433b4ce

Author: lingzhi98

add kv cache related keras_nlp patch (#414)
1 parent 0708d7a commit 433b4ce

File tree: 2 files changed (+86, -1 lines)

example/gemma/README.md

Lines changed: 7 additions & 1 deletion
@@ -16,7 +16,13 @@ export KAGGLE_KEY=xxxxxxxx
 Mark `intel-extension-for-openxla` folder as \<WORKSPACE\>, then
 ```bash
 cd <WORKSPACE>/example/gemma/
-pip install keras-nlp==0.10.0 keras==3.3.2
+pip install keras==3.3.2
+git clone https://github.com/keras-team/keras-nlp.git
+cd keras-nlp
+git checkout v0.10.0
+git apply ../keras_nlp.patch
+python setup.py install
+cd ..
 pip install -r ../../test/requirements.txt
 ```
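
As a quick sanity check after the install steps above (not part of the README; the snippet below is illustrative and assumes the `python setup.py install` step succeeded), you can confirm that the locally built, patched keras-nlp is the one on the import path:

```python
# Illustrative check: the patched keras-nlp built from the v0.10.0 tag should import cleanly.
import keras_nlp

print(keras_nlp.__version__)  # expected to report 0.10.0
```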

example/gemma/keras_nlp.patch

Lines changed: 79 additions & 0 deletions
@@ -0,0 +1,79 @@
diff --git a/keras_nlp/models/gemma/gemma_attention.py b/keras_nlp/models/gemma/gemma_attention.py
index 4b39126..c180752 100644
--- a/keras_nlp/models/gemma/gemma_attention.py
+++ b/keras_nlp/models/gemma/gemma_attention.py
@@ -155,15 +155,15 @@ class CachedGemmaAttention(keras.layers.Layer):
         query = self._apply_rope(query, cache_update_index)

         if cache is not None:
-            key_cache = cache[:, 0, ...]
-            value_cache = cache[:, 1, ...]
+            key_cache = cache[0]
+            value_cache = cache[1]
             key_update = self.key_dense(x)
             key_update = self._apply_rope(key_update, cache_update_index)
             value_update = self.value_dense(x)
             start = [0, cache_update_index, 0, 0]
             key = ops.slice_update(key_cache, start, key_update)
             value = ops.slice_update(value_cache, start, value_update)
-            cache = ops.stack((key, value), axis=1)
+            cache = [key, value]
         else:
             key = self.key_dense(x)
             key = self._apply_rope(key, cache_update_index)
diff --git a/keras_nlp/models/gemma/gemma_causal_lm.py b/keras_nlp/models/gemma/gemma_causal_lm.py
index 26e9aad..d29238c 100644
--- a/keras_nlp/models/gemma/gemma_causal_lm.py
+++ b/keras_nlp/models/gemma/gemma_causal_lm.py
@@ -215,17 +215,17 @@ class GemmaCausalLM(CausalLM):
         # Each decoder layer has a cache; we update them separately.
         caches = []
         for i, transformer_layer in enumerate(self.backbone.transformer_layers):
-            current_cache = cache[:, i, ...]
+            current_cache = cache[i]
             x, next_cache = transformer_layer(
                 x,
                 cache=current_cache,
                 cache_update_index=cache_update_index,
             )
             caches.append(next_cache)
-        cache = ops.stack(caches, axis=1)
+
         hidden_states = x = self.backbone.layer_norm(x)
         logits = self.backbone.token_embedding(x, reverse=True)
-        return logits, hidden_states, cache
+        return logits, hidden_states, caches

     def _build_cache(self, token_ids):
         """Build an empty cache for use with `call_with_cache()`."""
@@ -234,11 +234,13 @@ class GemmaCausalLM(CausalLM):
         num_layers = self.backbone.num_layers
         num_heads = self.backbone.num_key_value_heads
         head_dim = self.backbone.head_dim
-        shape = [batch_size, num_layers, 2, max_length, num_heads, head_dim]
-        cache = ops.zeros(shape, dtype=self.compute_dtype)
+        shape = [batch_size, max_length, num_heads, head_dim]
+        cache_list = []
+        for _ in range(0, num_layers):
+            cache_list.append([ops.zeros(shape, dtype=self.compute_dtype), ops.zeros(shape, dtype=self.compute_dtype)])
         # Seed the cache.
-        _, hidden_states, cache = self.call_with_cache(token_ids, cache, 0)
-        return hidden_states, cache
+        _, hidden_states, cache_list = self.call_with_cache(token_ids, cache_list, 0)
+        return hidden_states, cache_list

     def generate_step(
         self,
diff --git a/keras_nlp/models/gemma/gemma_decoder_block.py b/keras_nlp/models/gemma/gemma_decoder_block.py
index 0a91655..3ae7f8a 100644
--- a/keras_nlp/models/gemma/gemma_decoder_block.py
+++ b/keras_nlp/models/gemma/gemma_decoder_block.py
@@ -117,7 +117,7 @@ class GemmaDecoderBlock(keras.layers.Layer):
         batch_size = ops.shape(x)[0]
         input_length = output_length = ops.shape(x)[1]
         if cache is not None:
-            input_length = ops.shape(cache)[2]
+            input_length = ops.shape(cache[0])[1]

         causal_mask = compute_causal_mask(
             batch_size=batch_size,
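
Taken together, the patch switches the Gemma KV cache from a single stacked tensor of shape `[batch_size, num_layers, 2, max_length, num_heads, head_dim]` to a plain Python list holding one `[key, value]` pair of `[batch_size, max_length, num_heads, head_dim]` tensors per decoder layer, so `call_with_cache` and `_build_cache` no longer need `ops.stack` or per-layer tensor slicing. A minimal sketch of the two layouts, assuming only `keras.ops` and purely illustrative sizes (none of the numbers come from a real Gemma config):

```python
from keras import ops

# Illustrative sizes only; not taken from a real Gemma configuration.
batch_size, num_layers, max_length, num_heads, head_dim = 1, 2, 8, 4, 16

# Old layout: one stacked tensor holding every layer's key and value cache.
stacked = ops.zeros([batch_size, num_layers, 2, max_length, num_heads, head_dim])
key_cache_layer0 = stacked[:, 0, 0, ...]  # needs tensor slicing per layer

# Patched layout: a Python list with one [key, value] pair per decoder layer.
shape = [batch_size, max_length, num_heads, head_dim]
cache_list = [[ops.zeros(shape), ops.zeros(shape)] for _ in range(num_layers)]
key_cache_layer0 = cache_list[0][0]  # plain list indexing, no stack/slice ops
```

Keeping the per-layer caches as separate tensors is what lets the attention layer read `cache[0]`/`cache[1]` directly and the decoder block take the cached sequence length from `ops.shape(cache[0])[1]`.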
