@@ -298,6 +298,41 @@ def _load_jvp(primals, tangents, args_tree, **params):
298
298
299
299
ad .primitive_jvps [load_p ] = _load_jvp
300
300
301
def uninitialized_value(shape, dtype):
  """Return an array of `shape`/`dtype` filled with an "uninitialized" marker.

  Floating dtypes are filled with NaN, integer dtypes with their minimum
  representable value, and booleans with False, so that any accidental read
  of padding is as conspicuous as possible.

  Args:
    shape: shape of the array to create (may be `()` for a scalar).
    dtype: a jax/numpy dtype (floating, integer, or bool).

  Raises:
    NotImplementedError: for dtypes with no obvious fill value (e.g. complex).
  """
  if jnp.issubdtype(dtype, jnp.floating):
    return jnp.full(shape, jnp.nan, dtype)
  elif jnp.issubdtype(dtype, jnp.integer):
    return jnp.full(shape, jnp.iinfo(dtype).min, dtype)
  elif jnp.issubdtype(dtype, jnp.bool_):
    # Fix: `jnp.bool` is not present in many jax releases (AttributeError);
    # `jnp.bool_` is the canonical bool dtype, mirroring numpy's `np.bool_`.
    return jnp.full(shape, False, dtype)
  raise NotImplementedError(dtype)
309
+
310
def _pad_values_to_avoid_dynamic_slice_oob_shift(value,
                                                 slice_sizes, unpad=False):
  """Pad `value` so a dynamic slice of `slice_sizes` can never overrun it.

  lax.dynamic_slice and lax.dynamic_update_slice clamp the start index
  whenever the requested slice would run past the end of the array. Appending
  `slice_sizes` worth of uninitialized values on the high side of every axis
  guarantees that any requested slice fits, so no clamping ever happens.

  For example, if arr is [1.,2.,3.,4.] and a slice of size 4 with start
  index 2 is requested, the result after padding is [3.,4.,NaN,NaN] rather
  than the [1.,2.,3.,4.] the clamped, unpadded slice would produce.

  unpad=True performs the inverse operation (strips the padding again).
  """
  # One (low, high, interior) triple per axis; negating it undoes the pad.
  sign = -1 if unpad else 1
  padding_config = tuple(
      (0, sign * slice_size, 0) for slice_size in slice_sizes)
  fill = uninitialized_value(shape=(), dtype=value.dtype)
  return lax.pad(value, padding_value=fill, padding_config=padding_config)
333
+
334
def _unpad_values_to_avoid_dynamic_slice_oob_shift(value, slice_sizes):
  """Strip the padding added by `_pad_values_to_avoid_dynamic_slice_oob_shift`."""
  return _pad_values_to_avoid_dynamic_slice_oob_shift(
      value, slice_sizes, unpad=True)
301
336
302
337
def _load_discharge_rule (in_avals , out_avals , * args_flat , args_tree , ** _ ):
303
338
del out_avals # Unused.
@@ -315,6 +350,10 @@ def _load_discharge_rule(in_avals, out_avals, *args_flat, args_tree, **_):
315
350
scalar_dims = [not isinstance (s , Slice ) and not s .shape for s in indices ]
316
351
slice_starts = [s .start if isinstance (s , Slice ) else s for s in indices ]
317
352
slice_sizes = tuple (s .size if isinstance (s , Slice ) else 1 for s in indices )
353
+ # fixes an inconsistency with lax.dynamic_slice where if the slice goes out
354
+ # of bounds, it will instead move the start_index backwards so the slice
355
+ # will fit in memory.
356
+ ref = _pad_values_to_avoid_dynamic_slice_oob_shift (ref , slice_sizes )
318
357
out_ones = lax .dynamic_slice (ref , slice_starts , slice_sizes = slice_sizes )
319
358
out_indexer = tuple (0 if scalar else slice (None ) for scalar in scalar_dims )
320
359
out = out_ones [out_indexer ]
@@ -424,6 +463,10 @@ def _swap_discharge_rule(in_avals, out_avals, *args_flat, args_tree, **_):
424
463
]
425
464
slice_starts = [s .start if isinstance (s , Slice ) else s for s in indices ]
426
465
slice_sizes = tuple (s .size if isinstance (s , Slice ) else 1 for s in indices )
466
+ # fixes an inconsistency with lax.dynamic_update_slice where if the slice
467
+ # goes out of bounds, it will instead move the start_index backwards so the
468
+ # slice will fit in memory.
469
+ ref = _pad_values_to_avoid_dynamic_slice_oob_shift (ref , slice_sizes )
427
470
out = lax .dynamic_slice (ref , slice_starts , slice_sizes = slice_sizes )
428
471
out = jnp .squeeze (out , scalar_dims )
429
472
if mask is not None :
@@ -432,6 +475,7 @@ def _swap_discharge_rule(in_avals, out_avals, *args_flat, args_tree, **_):
432
475
val = jnp .where (mask , val , out_ )
433
476
val = jnp .expand_dims (val , scalar_dims )
434
477
x_new = lax .dynamic_update_slice (ref , val , start_indices = slice_starts )
478
+ x_new = _unpad_values_to_avoid_dynamic_slice_oob_shift (x_new , slice_sizes )
435
479
elif all (not isinstance (s , Slice ) for s in idx .indices ):
436
480
out = ref [idx .indices ]
437
481
if mask is not None :
0 commit comments