
Commit f8fd186

Add page_pool argument (#1303)
* Add page_pool argument
* Fix for rattention
* Add missing
1 parent 2cb1b7b commit f8fd186

File tree

9 files changed: +94 −14 lines changed

axlearn/common/attention.py

Lines changed: 35 additions & 2 deletions
@@ -76,6 +76,11 @@
 * When the accompanying argument is `query`, the `positions` argument is named as
 `query_position`. Similarly, when the argument `target`, it is named as `target_positions`.

+On `page_pool`:
+* If not None, stores the external paged KV pool possibly shared by multiple
+layers. Additionally, `cached_states` will not contain KV state. Instead, it will
+contain indices used to index into `page_pool`.
+
 TODO(changlan): Merge the use of `positions` and `time_step` to reduce cognitive complexity.

 """
@@ -317,6 +322,7 @@ def extend_step(
 self_attention_logit_biases: Optional[Tensor] = None,
 cross_attention_data: Optional[Tensor] = None,
 cross_attention_logit_biases: Optional[Tensor] = None,
+page_pool: Optional[Nested[Tensor]] = None,
 ) -> tuple[NestedTensor, Output]:
 """Computes incremental outputs.

@@ -335,6 +341,7 @@ def extend_step(
 cross_attention_logit_biases: An optional Tensor of shape
 [..., target_step_length, source_length], where `target_step_length` must match
 the shape of `data`.
+page_pool: See file-level comments on `page_pool`.

 Returns:
 (updated_cached_states, output), where:
@@ -1653,6 +1660,7 @@ def _forward_for_mode(
 query_positions: Optional[Tensor] = None,
 cached_states: Optional[NestedTensor] = None,
 return_aux: Optional[set[str]] = None,
+page_pool: Optional[Nested[Tensor]] = None,
 ) -> tuple[Nested[Tensor], Optional[Output]]:
 """Computes attention for the given query, key, value, and attention logit biases.

@@ -1672,6 +1680,7 @@ def _forward_for_mode(
 query_positions: See ``On positions`` in the file comments.
 cached_states: Optional NestedTensor as produced by `init_states`.
 return_aux: See comments on `Output`.
+page_pool: See file-level comments on `page_pool`.

 Returns:
 A tuple (cached_states, output):
@@ -1749,6 +1758,7 @@ def _forward_for_mode(
 v_proj=v_proj,
 key_positions=query_positions,
 live_step_len=live_step_len,
+page_pool=page_pool,
 )
 if mode == ForwardMode.EXTEND_STEP:
 kv_state = KVState(*kv_cache_output)
@@ -2042,6 +2052,7 @@ def extend_step(
 kv_state: Optional[KVState] = None,
 attention_logit_biases: Optional[Tensor] = None,
 return_aux: Optional[set[str]] = None,
+page_pool: Optional[Nested[Tensor]] = None,
 ) -> tuple[NestedTensor, Output]:
 """Computes the value vector given the query of the current step.
 This function is used by autoregressive decoding.
@@ -2068,6 +2079,7 @@ def extend_step(
 The biases should already include causal masking for decoding, plus other biases
 if necessary.
 return_aux: See comments on `Output`.
+page_pool: See file-level comments on `page_pool`.

 Returns:
 A `NestedTensor` state of key and value pair along with index updated at `time_step`.
@@ -2083,6 +2095,7 @@ def extend_step(
 kv_state=kv_state,
 attention_logit_biases=attention_logit_biases,
 return_aux=return_aux,
+page_pool=page_pool,
 )

 @staticmethod
@@ -2640,6 +2653,7 @@ def _forward_for_mode(
 target_positions: Optional[Tensor] = None,
 cached_states: Optional[NestedTensor] = None,
 return_aux: Optional[set[str]] = None,
+page_pool: Optional[Nested[Tensor]] = None,
 ) -> tuple[Optional[Nested[Tensor]], Optional[Output]]:
 """Computes either self-attention or cross-attention for the given target and source.

@@ -2654,6 +2668,7 @@
 target_positions: See ``On positions`` in the file comments.
 cached_states: Optional NestedTensor as produced by `init_states`.
 return_aux: See comments on `Output`.
+page_pool: See file-level comments on `page_pool`.

 Returns:
 A tuple (cached_states, output):
@@ -2709,6 +2724,7 @@ def attention_thunk(target: Tensor) -> tuple[Optional[NestedTensor], Tensor]:
 target,
 **kv_kwargs,
 attention_logit_biases=attention_logit_biases,
+page_pool=page_pool,
 )
 else:
 raise ValueError(f"Unrecognized mode {mode}.")
@@ -2841,6 +2857,7 @@ def extend_step(
 source: Optional[Union[Tensor, KVState]] = None,
 attention_logit_biases: Optional[Tensor] = None,
 return_aux: Optional[set[str]] = None,
+page_pool: Optional[Nested[Tensor]] = None,
 ) -> tuple[Nested[Tensor], Output]:
 """Computes the value vector given the query of the current step.
 This function is used by autoregressive decoding.
@@ -2858,6 +2875,7 @@ def extend_step(
 attention_logit_biases should have already taken care of causal masking for
 decoding, plus other maskings necessary.
 return_aux: See comments on `Output`.
+page_pool: See file-level comments on `page_pool`.

 Returns:
 A `NestedTensor` state of key and value pair along with index updated at `time_step`.
@@ -2874,6 +2892,7 @@ def extend_step(
 cached_states=cached_states,
 attention_logit_biases=attention_logit_biases,
 return_aux=return_aux,
+page_pool=page_pool,
 )


@@ -3196,6 +3215,7 @@ def _forward_for_mode(
 target_positions: Optional[Tensor] = None,
 cached_states: Optional[NestedTensor] = None,
 return_aux: Optional[set[str]] = None,
+page_pool: Optional[Nested[Tensor]] = None,
 ) -> tuple[Optional[NestedTensor], Optional[BaseTransformerLayer.Output]]:
 """Computes transformer layer outputs and self/cross-attention probabilities.

@@ -3212,6 +3232,7 @@ def _forward_for_mode(
 target_positions: See ``positions`` in the file comments.
 cached_states: Optional NestedTensor as produced by `init_states`.
 return_aux: See comments on BaseTransformerLayer.forward.
+page_pool: See file-level comments on `page_pool`.

 Returns:
 A tuple (cached_states, output):
@@ -3273,6 +3294,7 @@ def _forward_for_mode(
 source=self_attention_kv_state,
 attention_logit_biases=self_attention_logit_biases,
 return_aux=self_attention_return_aux,
+page_pool=page_pool,
 )
 else:
 raise ValueError(f"Unrecognized mode {mode}.")
@@ -3979,6 +4001,7 @@ def _forward_for_mode(
 assert value.shape[0] == cfg.num_layers, f"{path}={shapes(value)}"

 def layer_fn(carry, x_i):
+x_i, page_pool = x_i
 if mode == ForwardMode.FORWARD:
 layer_states, layer_outputs = None, self.layer(**carry, **layer_kwargs)
 elif mode == ForwardMode.INIT_STATES:
@@ -3990,7 +4013,10 @@ def layer_fn(carry, x_i):
 elif mode == ForwardMode.EXTEND_STEP:
 assert x_i is not None
 layer_states, layer_outputs = self.layer.extend_step(
-cached_states=x_i, **carry, **layer_kwargs
+cached_states=x_i,
+**carry,
+**layer_kwargs,
+page_pool=page_pool,
 )
 else:
 raise ValueError(f"Unrecognized mode {mode}.")
@@ -4005,6 +4031,7 @@ def layer_fn(carry, x_i):
 return carry, ys

 ys.update({k: v for k, v in layer_outputs._asdict().items() if k not in carry})
+ys["page_pool"] = page_pool
 return {k: getattr(layer_outputs, k) for k in carry}, ys

 if cfg.carry is None:
@@ -4013,10 +4040,16 @@ def layer_fn(carry, x_i):
 layer_kwargs["data"] = data
 carry = {k: layer_kwargs.pop(k) for k in cfg.carry}

-repeat_outputs: Repeat.Output = self._run(layer_fn, carry=carry, xs=cached_states)
+page_pool = layer_kwargs.pop("page_pool", None)
+repeat_outputs: Repeat.Output = self._run(
+layer_fn, carry=carry, xs=(cached_states, page_pool)
+)
 carry = repeat_outputs.carry
 ys = repeat_outputs.ys
 updated_states = ys.pop("cached_states", None)
+out_page_pool = ys.pop("page_pool", None)
+if page_pool is not None and out_page_pool is not None:
+page_pool[:] = out_page_pool # type: ignore

 if cache_init:
 assert ys == {}
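
The write-back at the end of this hunk uses slice assignment so the caller-supplied pool observes the per-layer updates. A minimal Python sketch of that pattern (illustrative only):

def write_back(page_pool, out_page_pool):
    # Rebinding (`page_pool = out_page_pool`) would only change the local name and
    # be invisible to the caller; slice assignment mutates the shared list object.
    page_pool[:] = out_page_pool

pools = [["old_pool"]]
write_back(pools, [["new_pool"]])
assert pools == [["new_pool"]]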

axlearn/common/decoder.py

Lines changed: 2 additions & 0 deletions
@@ -496,6 +496,7 @@ def _forward_for_mode(
 cross_attention_data: Optional[Tensor] = None,
 cross_attention_logit_biases: Optional[Tensor] = None,
 cached_states: Optional[NestedTensor] = None,
+page_pool: Optional[Nested[Tensor]] = None,
 ) -> tuple[Optional[NestedTensor], Tensor]:
 validate_contains_paths(input_batch, paths=["input_ids"])
 input_segment_ids = input_batch.get("input_segment_ids", None)
@@ -538,6 +539,7 @@ def _forward_for_mode(
 self_attention_logit_biases=self_attention_logit_biases,
 cross_attention_data=cross_attention_data,
 cross_attention_logit_biases=cross_attention_logit_biases,
+page_pool=page_pool,
 )
 else:
 raise ValueError(f"Unrecognized mode {mode}.")

axlearn/common/kv_cache/base_kv_cache.py

Lines changed: 4 additions & 2 deletions
@@ -84,6 +84,7 @@ def extend_step(
 v_proj: Tensor,
 key_positions: Tensor,
 live_step_len: Optional[Tensor] = None,
+page_pool: Optional[Nested[Tensor]] = None,
 ) -> tuple[Nested[Tensor], Output]:
 """Updates the KV cache per extend step.

@@ -96,8 +97,9 @@
 k_proj: A Tensor of shape [batch, step_length, num_kv_heads, per_head_dim].
 v_proj: A Tensor of shape [batch, step_length, num_kv_heads, per_head_dim].
 key_positions: An optional Tensor of shape [1|batch, step_length].
-live_step_len: An optional Tensor of shape [batch]. Please refer to ``On live_step_len``
-in the file docstring for details.
+live_step_len: An optional Tensor of shape [batch]. See file-level docstring of
+`attention.py`
+page_pool: See file-level docstring of `attention.py`.

 Returns:
 A tuple (updated_state, output):

axlearn/common/kv_cache/kv_cache.py

Lines changed: 2 additions & 0 deletions
@@ -38,11 +38,13 @@ def extend_step(
 v_proj: Tensor,
 key_positions: Tensor,
 live_step_len: Optional[Tensor] = None,
+page_pool: Optional[Nested[Tensor]] = None,
 ) -> tuple[Nested[Tensor], BaseKVCache.Output]:
 # TODO(dhwang2): By returning only the valid portions of the KV (by live_step_len),
 # the attention complexity can be reduced from O(max_len²) to O(live_step_len²), especially
 # in prefill.
 # The remaining part after `live_step_len` is considered padding.
+assert page_pool is None
 del live_step_len
 if k_proj.shape != v_proj.shape:
 raise ValueError(f"{k_proj.shape=} != {v_proj.shape=}")

axlearn/common/kv_cache/paged_kv_cache.py

Lines changed: 29 additions & 7 deletions
@@ -133,6 +133,7 @@ def extend_step(
 v_proj: Tensor,
 key_positions: Tensor,
 live_step_len: Optional[Tensor] = None,
+page_pool: Optional[Nested[Tensor]] = None,
 ) -> tuple[Nested[Tensor], KVCache.Output]:
 """Extend the cache with the new key and value.

@@ -166,6 +167,7 @@ def extend_step(
 raise ValueError(f"{k_proj.shape[1]=} != {key_positions.shape[1]=}")

 if "page_indices" not in cached_states:
+assert page_pool is None
 # Prefill, return kv cache directly
 cached_states["key"] = k_proj
 cached_states["value"] = v_proj
@@ -177,8 +179,21 @@ def extend_step(

 # kv_pages shape: [num_heads, max_pages_global, page_size, head_dim]. Also refer to
 # https://github.com/jax-ml/jax/blob/main/jax/experimental/pallas/ops/tpu/paged_attention/paged_attention_kernel.py#L388
-k_pages: Tensor = cached_states["key"]
-v_pages: Tensor = cached_states["value"]
+if page_pool is not None:
+# We use `group_info` to index into `page_pool` to get the paged KV pool for this
+# layer.
+group_info = cached_states["group_info"]
+# HACK(hanzhi-zhou): we store the indices as dict keys to workaround them being
+# converted to tracers.
+group_idx = list(group_info["group_idx"].keys())[0]
+repeat_idx = list(group_info["repeat_idx"].keys())[0]
+pool = page_pool[group_idx][repeat_idx]
+k_pages: Tensor = pool.k_pages # type: ignore
+v_pages: Tensor = pool.v_pages # type: ignore
+else:
+k_pages: Tensor = cached_states["key"]
+v_pages: Tensor = cached_states["value"]
+
 assert k_pages.shape == v_pages.shape

 batch = page_indices.shape[0]
@@ -220,11 +235,18 @@ def update_kv_pages(kv_pages, page_indices, key_positions, kv_proj):
 v_pages, page_indices, key_positions, v_proj.astype(v_pages.dtype)
 )

-updated_state = dict(
-key=updated_k_pages,
-value=updated_v_pages,
-page_indices=page_indices,
-)
+if page_pool is not None:
+page_pool[group_idx][repeat_idx] = type(pool)(updated_k_pages, updated_v_pages)
+
+# Updates are already performed through mutable arrays above. We don't perform state
+# updates through `updated_state`.
+updated_state = dict(key=None, value=None, page_indices=None)
+else:
+updated_state = dict(
+key=updated_k_pages,
+value=updated_v_pages,
+page_indices=page_indices,
+)

 assert updated_k_pages.shape == k_pages.shape
 assert updated_v_pages.shape == v_pages.shape
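
A minimal sketch of the dict-key workaround referenced in the HACK comment above (illustrative; `read_pool` and the toy pool are assumptions): leaves of a pytree argument become tracers under `jax.jit`, but dict keys are part of the tree structure and remain concrete Python ints, so they can still drive ordinary list indexing into the pool. The cost is a retrace whenever the key changes, which is acceptable for static layer indices.

import jax
import jax.numpy as jnp

pool = [jnp.zeros(4), jnp.ones(4)]  # stand-in for per-layer paged KV pools

@jax.jit
def read_pool(group_info, x):
    # The index is encoded as the sole dict key, so it is not converted to a tracer.
    idx = list(group_info["group_idx"].keys())[0]  # concrete int under jit
    return x + pool[idx]  # ordinary Python list indexing

out = read_pool({"group_idx": {1: None}}, jnp.zeros(4))
assert out.tolist() == [1.0, 1.0, 1.0, 1.0]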

axlearn/common/kv_cache/sliding_window_kv_cache.py

Lines changed: 2 additions & 0 deletions
@@ -53,6 +53,7 @@ def extend_step(
 v_proj: Tensor,
 key_positions: Tensor,
 live_step_len: Optional[Tensor] = None,
+page_pool: Optional[Nested[Tensor]] = None,
 ) -> tuple[Nested[Tensor], BaseKVCache.Output]:
 """Updates the sliding window KV cache per extend step.

@@ -70,6 +71,7 @@ def extend_step(
 * output: The output k_proj, v_proj, and key_positions, which are merged with
 KV cache, resulting in a length of `cached_kv_length + step_size`.
 """
+assert page_pool is None
 cfg = self.config
 cached_key: Tensor = cached_states["key"]
 cached_value: Tensor = cached_states["value"]

axlearn/common/module.py

Lines changed: 11 additions & 3 deletions
@@ -1002,6 +1002,8 @@ class _Functional:
 context: InvocationContext = struct.field(pytree_node=True)
 # Whether to require that context.parent is current_context().
 require_parent: bool = struct.field(pytree_node=False)
+# Whether to copy the argument pytrees to prevent method_fn from mutating the original.
+copy_args_tree: bool = struct.field(pytree_node=False, default=True)

 def __call__(self, *args, **kwargs) -> tuple[Any, OutputCollection]:
 """Invokes method_fn in a pure functional fashion.
@@ -1024,13 +1026,14 @@ def __call__(self, *args, **kwargs) -> tuple[Any, OutputCollection]:
 call = getattr(self.method_fn, "__qualname__", None) or getattr(self.method_fn, "__name__")
 logging.vlog(1, "functional: %s.%s (*%s, **%s)", call, self.method_fn, args, kwargs)

-# Copy to prevent method_fn from mutating the original.
 # Some badly behaved tests call F() with an InvocationContext.state that contains
 # circular references.
 # This results in a cryptic error that doesn't make the root cause obvious.
 # So we raise a clearer error explicitly.
 raise_for_cycles(dict(context=self.context, args=args, kwargs=kwargs))
-context, args, kwargs = jax.tree.map(lambda x: x, (self.context, args, kwargs))
+context = self.context
+if self.copy_args_tree:
+context, args, kwargs = jax.tree.map(lambda x: x, (self.context, args, kwargs))

 with set_current_context(context, require_parent=self.require_parent):
 # pylint: disable-next=not-an-iterable,not-a-mapping,not-callable
@@ -1047,6 +1050,7 @@ def functional(
 method: str = "forward",
 is_training: bool,
 drop_output_collections: Sequence[str] = ("module_outputs",),
+copy_args_tree: bool = True,
 ) -> tuple[Any, OutputCollection]:
 """Invokes <module>.<method> in a pure functional fashion.

@@ -1065,6 +1069,8 @@
 is_training: Whether the invocation should run in the training mode.
 drop_output_collections: The output collection types to drop.
 Defaults to dropping all module outputs.
+copy_args_tree: Whether to copy the `inputs` pytree to prevent method_fn from mutating the
+original. Defaults to True.

 Returns:
 (method_outputs, output_collection), where
@@ -1092,7 +1098,9 @@ def functional(
 args = inputs
 method_fn = getattr(module, method)

-fn = _Functional(context=context, method_fn=method_fn, require_parent=True)
+fn = _Functional(
+context=context, method_fn=method_fn, require_parent=True, copy_args_tree=copy_args_tree
+)
 method_outputs, output_collection = fn(*args, **kwargs)

 for output_collection_type in drop_output_collections:
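
A minimal sketch of what `copy_args_tree` controls (illustrative only): `jax.tree.map(lambda x: x, tree)` keeps the leaves but rebuilds every container, so in-place mutations made by the callee land on the copy. Passing `copy_args_tree=False` hands the original containers through, which is what lets a write-back such as `page_pool[:] = out_page_pool` be observed by the caller.

import jax

page_pool = [["old_pool"]]
kwargs = {"page_pool": page_pool}

# The identity tree-map copies the container structure while keeping the leaves.
copied = jax.tree.map(lambda x: x, kwargs)
copied["page_pool"][:] = [["new_pool"]]  # mutate the copied list in place

assert page_pool == [["old_pool"]]           # the caller's list is untouched
assert copied["page_pool"] is not page_pool  # containers were rebuilt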
