
Commit 4c138ef

[llama-mm] Add torch.cond to replace if condition in MHA
Summary: In torchtune's MultiHeadAttention we have this logic: if `y` is not None, calculate `k` and `v` from `y` and update the KVCache; otherwise (if `y` is None), retrieve `k` and `v` from the KVCache. This data-dependent Python branch cannot be captured by torch.export.

Here I'm proposing a rewrite: if `y` is not entirely NaN (not a number), calculate `k` and `v` from `y` and update the KVCache; otherwise (if every value of `y` is NaN), retrieve `k` and `v` from the KVCache. This rewrite lets the module satisfy the requirements of `torch.cond` and avoid specialization:

* The operands to `torch.cond` must have the same shape for the true branch and the false branch.

This means we will have to change this logic in torchtune:

```
if encoder_input is not None:
    encoder_embed = self.encoder(**encoder_input)

output = self.decoder(
    tokens=tokens,
    mask=mask,
    encoder_input=encoder_embed,
    encoder_mask=encoder_mask,
    input_pos=input_pos,
)
```

to be:

```
if encoder_input is not None:
    encoder_embed = self.encoder(**encoder_input)
else:
    encoder_embed = torch.full_like(encoder_input, torch.nan)

output = self.decoder(
    tokens=tokens,
    mask=mask,
    encoder_input=encoder_embed,
    encoder_mask=encoder_mask,
    input_pos=input_pos,
)
```

Test Plan: Rely on unit tests.

Reviewers:

Subscribers:

Tasks:

Tags:
1 parent 71612a6 commit 4c138ef
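As background for the summary above, here is a minimal, self-contained sketch of the `torch.cond` contract it relies on; the branch functions and toy shapes are illustrative only and are not part of this commit. Both branches take the same operands and must return tensors with matching shape and dtype, which is why an all-NaN placeholder shaped like the real input can stand in for "no input".

```
import torch


def read_branch(y: torch.Tensor) -> torch.Tensor:
    # "y is all NaN": pretend to read previously cached values.
    return torch.zeros_like(y)


def compute_branch(y: torch.Tensor) -> torch.Tensor:
    # "y is real data": compute fresh values with the same shape.
    return y * 2.0


y_real = torch.randn(2, 3)
y_placeholder = torch.full((2, 3), float("nan"))  # same shape as the real input

# The predicate mirrors the commit: an all-NaN input means "use the cache".
out_fresh = torch.cond(
    torch.isnan(y_real).all().item(), read_branch, compute_branch, (y_real,)
)
out_cached = torch.cond(
    torch.isnan(y_placeholder).all().item(), read_branch, compute_branch, (y_placeholder,)
)
print(out_fresh.shape, out_cached.shape)  # both torch.Size([2, 3])
```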

File tree: 3 files changed, +51 -12 lines changed


extension/llm/modules/attention.py

Lines changed: 29 additions & 12 deletions
```
@@ -246,7 +246,6 @@ def forward(
         # x has shape [b, s_x, d]
         # y has shape [b, s_y, d]
         b, s_x, _ = x.shape
-        s_y = y.shape[1] if y is not None else 0
 
         # q has shape [b, s_x, num_heads * head_dim]
         q = self.q_proj(x)
@@ -263,16 +262,9 @@ def forward(
         if self.q_norm is not None:
             q = self.q_norm(q)
 
-        if y is None:
-            if self.kv_cache is None:
-                raise ValueError(
-                    "Must provide y input or use kv_cache to enable streaming decoding"
-                )
-            k = self.kv_cache.k_cache
-            v = self.kv_cache.v_cache
-        else:
+        def calculate_kv(y):
             # Update k and v shape, positional embeddings, and normalization
-
+            s_y = y.shape[1]
             # k has shape [b, s_y, num_kv_heads * head_dim]
             # v has shape [b, s_y, num_kv_heads * head_dim]
             k = self.k_proj(y)
@@ -288,10 +280,35 @@ def forward(
             # Normalize k
             if self.k_norm is not None:
                 k = self.k_norm(k)
+            return k, v
+
+        def true_fn(y):
+            kv_cache = self.kv_cache.clone()
+            return kv_cache.k_cache, kv_cache.v_cache, kv_cache.cache_pos
+
+        def false_fn(y):
+            k, v = calculate_kv(y)
+            kv_cache = self.kv_cache.clone()
+            kv_cache.update(k, v)
+            return kv_cache.k_cache, kv_cache.v_cache, kv_cache.cache_pos
 
+        # If kv cache is None, we expect y to be provided
+        if self.kv_cache is None:
+            assert (
+                y is not None
+            ), "Must provide y input or use kv_cache to enable streaming decoding"
+            k, v = calculate_kv(y)
+        else:
+            # Expecting the k, v returning here to be the same size of self.kv_cache
+            # In eager, we expect this predicate to specialize. In export, this will
+            # become a SymBool so it's not specialized.
+            k, v, cache_pos = torch.cond(
+                torch.isnan(y).all().item(), true_fn, false_fn, (y,)
+            )
         # Update key-value cache
-        if self.kv_cache is not None and self.cache_enabled:
-            k, v = self.kv_cache.update(k, v)
+            self.kv_cache.k_cache.copy_(k)
+            self.kv_cache.v_cache.copy_(v)
+            self.kv_cache.cache_pos.copy_(cache_pos)
 
         output = self._sdpa(q, k, v, b, s_x)
         return self.output_proj(output)
```
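The structure of the new forward — branch functions that return cloned cache tensors, with the `copy_` back into the module's buffers happening only after the `torch.cond` — follows from `torch.cond` requiring its branches to be free of side effects. Below is a hedged, self-contained sketch of the same pattern; the `ToyCache` class, its buffer shapes, and the trivial "update" are assumptions for illustration, not code from this diff.

```
import torch


class ToyCache(torch.nn.Module):
    """Illustrative stand-in for the KV-cache update pattern above."""

    def __init__(self, seq_len: int = 8, dim: int = 4):
        super().__init__()
        self.register_buffer("k_cache", torch.zeros(seq_len, dim))
        self.register_buffer("v_cache", torch.zeros(seq_len, dim))

    def forward(self, y: torch.Tensor):
        def true_fn(y):
            # All-NaN input: return copies of the current cache contents.
            return self.k_cache.clone(), self.v_cache.clone()

        def false_fn(y):
            # Real input: the "updated" cache is derived from y (trivially here).
            return y.clone(), 2.0 * y

        # Branches are functional; both return two tensors of identical shape.
        k, v = torch.cond(torch.isnan(y).all().item(), true_fn, false_fn, (y,))
        # Mutate the module's own buffers only after the cond, as in the diff above.
        self.k_cache.copy_(k)
        self.v_cache.copy_(v)
        return k, v


cache = ToyCache()
k, v = cache(torch.randn(8, 4))                    # write path (real input)
k2, v2 = cache(torch.full((8, 4), float("nan")))   # read path (NaN placeholder)
assert torch.equal(k, k2) and torch.equal(v, v2)
```

Cloning inside the branches keeps them functional; the in-place update of the live cache is hoisted outside the cond so export can capture both branches without specializing on the predicate.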

extension/llm/modules/kv_cache.py

Lines changed: 20 additions & 0 deletions
```
@@ -127,3 +127,23 @@ def update(
         self.cache_pos.add_(seq_len)
 
         return k_out, v_out
+
+    def clone(self) -> "KVCache":
+        """Create a clone of the KVCache."""
+        if self.transpose_cache:
+            max_seq_len = self.k_cache.shape[1]
+            num_kv_heads = self.k_cache.shape[2]
+        else:
+            max_seq_len = self.k_cache.shape[2]
+            num_kv_heads = self.k_cache.shape[1]
+        clone = KVCache(
+            batch_size=self.batch_size,
+            max_seq_len=max_seq_len,
+            num_kv_heads=num_kv_heads,
+            head_dim=self.k_cache.shape[3],
+            dtype=self.k_cache.dtype,
+        )
+        clone.k_cache.copy_(self.k_cache)
+        clone.v_cache.copy_(self.v_cache)
+        clone.cache_pos.copy_(self.cache_pos)
+        return clone
```
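For orientation, a brief usage sketch of the new `clone()`: the constructor arguments mirror those used in the diff above, but the exact `KVCache` signature and import path are assumptions, not taken from this commit.

```
import torch

# Assumed import path for the module shown above.
from executorch.extension.llm.modules.kv_cache import KVCache

cache = KVCache(
    batch_size=1, max_seq_len=16, num_kv_heads=2, head_dim=8, dtype=torch.float32
)
snapshot = cache.clone()

# The clone owns its own storage, so a torch.cond branch can return its
# tensors without aliasing or mutating the live cache buffers.
assert snapshot.k_cache.data_ptr() != cache.k_cache.data_ptr()
assert torch.equal(snapshot.k_cache, cache.k_cache)
```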

extension/llm/modules/test/test_attention.py

Lines changed: 2 additions & 0 deletions
```
@@ -41,9 +41,11 @@ def setUp(self):
         self.k_proj = torch.nn.Linear(
             self.embed_dim, self.num_kv_heads * self.head_dim, bias=False
         )
+        self.k_proj.weight.requires_grad = False
         self.v_proj = torch.nn.Linear(
             self.embed_dim, self.num_kv_heads * self.head_dim, bias=False
         )
+        self.v_proj.weight.requires_grad = False
         self.output_proj = torch.nn.Linear(self.embed_dim, self.embed_dim, bias=False)
         self.pos_embeddings = Llama3ScaledRoPE(
             dim=self.head_dim,
```
