Commit 7e96914

justinjfu authored and Google-ML-Automation committed
Add Pallas Philox implementation.
Implemented in the same style as the threefry kernel. Philox is roughly 2x faster than the existing JAX Threefry implementation in both runtime and compile time. PiperOrigin-RevId: 707276043
1 parent d4031e9 commit 7e96914

4 files changed: +321 -42 lines changed
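The 2x claim can be spot-checked with a rough harness along these lines (an illustrative sketch, not part of the commit; it assumes a TPU backend and that the Philox module below has been imported so its impl name is registered):

    import time
    import jax

    draw = jax.jit(lambda key: jax.random.bits(key, (8192, 8192)))
    for impl in ("threefry2x32", "pallas_philox4x32"):
      key = jax.random.key(0, impl=impl)
      start = time.perf_counter()
      draw(key).block_until_ready()  # the first call includes compile time
      print(impl, time.perf_counter() - start)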
jax/experimental/pallas/ops/tpu/random/philox.py

Lines changed: 208 additions & 0 deletions
@@ -0,0 +1,208 @@
# Copyright 2024 The JAX Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Implementation of the Philox PRNG as a Pallas kernel."""
from typing import Sequence
import jax
from jax import typing
from jax._src import prng
from jax.experimental import pallas as pl
from jax.experimental.pallas import tpu as pltpu
import jax.numpy as jnp
import numpy as np
from jax.experimental.pallas.ops.tpu.random import prng_utils

Shape = Sequence[int]

BLOCK_SIZE = (256, 256)

# Philox constants. See the original paper:
# "Parallel Random Numbers: As Easy as 1, 2, 3", Salmon et al., 2011.
K_HI_32 = 0x9E3779B9
K_LO_32 = 0xBB67AE85
MUL_A = 0xCD9E8D57
MUL_B = 0xD2511F53


def mul32_hi_lo(x: jax.Array, y: jax.Array) -> tuple[jax.Array, jax.Array]:
  """Multiplies two 32-bit values and returns the high and low 32 bits."""
  # Split operands into 16-bit halves so every partial product fits in 32 bits.
  xhi = x >> 16
  yhi = y >> 16
  xlo = x & 0xffff
  ylo = y & 0xffff

  xy_hi = xhi * yhi
  xy_lo = xlo * ylo
  cross_xy = xhi * ylo
  cross_yx = xlo * yhi
  # Carry out of the low 32 bits of the full product.
  carry = (cross_xy & 0xffff) + (cross_yx & 0xffff) + (xy_lo >> 16)
  # The low word is the product modulo 2**32, which uint32 multiplication
  # yields directly.
  return xy_hi + (cross_xy >> 16) + (cross_yx >> 16) + (carry >> 16), x * y
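A quick cross-check of mul32_hi_lo against exact integer arithmetic (an illustrative sketch, not part of the commit):

    import jax.numpy as jnp

    x = jnp.uint32(0xCD9E8D57)
    y = jnp.uint32(0x9E3779B9)
    hi, lo = mul32_hi_lo(x, y)
    ref = 0xCD9E8D57 * 0x9E3779B9  # exact 64-bit product as a Python int
    assert int(hi) == ref >> 32
    assert int(lo) == ref & 0xFFFFFFFF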


def philox_4x32(hi0, lo0, hi1, lo1, k_hi, k_lo, rounds: int = 10):
  """Philox 4x32 keyed hash function."""
  k_hi_const = jnp.array(K_HI_32, dtype=jnp.uint32)
  k_lo_const = jnp.array(K_LO_32, dtype=jnp.uint32)
  mul_a = jnp.array(MUL_A, dtype=jnp.uint32)
  mul_b = jnp.array(MUL_B, dtype=jnp.uint32)

  for i in range(rounds):
    # Compute one round of the permutation.
    new_hi0, new_lo0 = mul32_hi_lo(mul_a, hi1)
    new_hi0 = new_hi0 ^ lo0 ^ k_hi
    new_hi1, new_lo1 = mul32_hi_lo(mul_b, hi0)
    new_hi1 = new_hi1 ^ lo1 ^ k_lo
    hi0, lo0, hi1, lo1 = new_hi0, new_lo0, new_hi1, new_lo1

    # Bump the key on every round except the last.
    if i != rounds - 1:
      k_hi = k_hi + k_hi_const
      k_lo = k_lo + k_lo_const
  return hi0, lo0, hi1, lo1
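Because philox_4x32 is plain jax.numpy, it can be exercised directly outside of any Pallas kernel. A minimal sketch (shapes and key values are arbitrary):

    import jax
    import jax.numpy as jnp

    zeros = jnp.zeros((8, 128), dtype=jnp.uint32)
    counts = jax.lax.broadcasted_iota(jnp.uint32, (8, 128), 1)
    hi0, lo0, hi1, lo1 = philox_4x32(
        zeros, counts, zeros, zeros,
        jnp.uint32(0x12345678), jnp.uint32(0x9ABCDEF0))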


def philox_4x32_kernel(key,
                       shape: Shape,
                       unpadded_shape: Shape,
                       block_size: tuple[int, int],
                       offset: typing.ArrayLike = 0,
                       fuse_output: bool = True):
  """Generates random bits using the Philox keyed hash function.

  Args:
    key: A Philox key of shape (2,).
    shape: The shape of the output. Must be divisible by `block_size`.
    unpadded_shape: If `shape` is padded, the shape the output tensor would
      have without padding. This is needed for the indexing calculations
      inside the kernel. If `shape` is not padded, this should equal `shape`.
    block_size: The block size of the kernel.
    offset: An optional scalar offset added to the counters.
    fuse_output: Whether to XOR the high and low output words together into a
      single array of random bits.

  Returns:
    A tensor of random bits of shape `shape` if fuse_output=True. Otherwise,
    a tensor of shape (2, *shape) whose first channel holds the high bits and
    whose second channel holds the low bits.
  """
  shape = tuple(shape)
  if np.prod(shape) > jnp.iinfo(jnp.uint32).max:
    raise ValueError(
        f"Shape too large: {np.prod(shape)} > {np.iinfo(jnp.uint32).max}")

  if (shape[-2] % block_size[-2] != 0) or (shape[-1] % block_size[-1] != 0):
    raise ValueError(
        f"Shape dimensions {shape[-2:]} must be divisible by {block_size}")
  grid_dims = shape[:-2] + (
      shape[-2] // block_size[-2], shape[-1] // block_size[-1],)
  offset = jnp.array(offset, dtype=jnp.uint32)
  if offset.ndim != 0:
    raise ValueError(f"Offset must be scalar, got {offset.shape}")
  offset = jnp.reshape(offset, (1,))

  def kernel(offset_ref, key_ref, out_ref):
    counts_idx = tuple(pl.program_id(i) for i in range(len(grid_dims)))
    offset = prng_utils.compute_scalar_offset(
        counts_idx, unpadded_shape, block_shape)
    counts_lo = prng_utils.blocked_iota(block_size, unpadded_shape)
    counts_lo = counts_lo + offset + offset_ref[0]
    counts_lo = counts_lo.astype(jnp.uint32)
    # TODO(justinfu): Support hi bits on count.
    _zeros = jnp.zeros_like(counts_lo)
    k1 = jnp.reshape(key_ref[0, 0], (1, 1))
    k2 = jnp.reshape(key_ref[0, 1], (1, 1))
    o1, o2, _, _ = philox_4x32(_zeros, counts_lo, _zeros, _zeros, k1, k2)
    if fuse_output:
      out_bits = o1 ^ o2
      out_ref[...] = out_bits.reshape(out_ref.shape)
    else:
      out_ref[0, ...] = o1.reshape(out_ref[0].shape)
      out_ref[1, ...] = o2.reshape(out_ref[0].shape)

  key = key.reshape((1, 2))
  # Closed over by `kernel` above.
  block_shape = (1,) * (len(shape)-2) + block_size
  if fuse_output:
    out = jax.ShapeDtypeStruct(shape, dtype=jnp.uint32)
    out_spec = pl.BlockSpec(block_shape, lambda *idxs: idxs)
  else:
    out = jax.ShapeDtypeStruct((2,) + shape, dtype=jnp.uint32)
    out_spec = pl.BlockSpec((2,) + block_shape, lambda *idxs: (0, *idxs))
  return pl.pallas_call(
      kernel,
      in_specs=[
          pl.BlockSpec(memory_space=pltpu.TPUMemorySpace.SMEM),
          pl.BlockSpec(memory_space=pltpu.TPUMemorySpace.SMEM),
      ],
      out_specs=out_spec,
      grid=grid_dims,
      out_shape=out,
  )(offset, key)
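For reference, a direct call on an already block-aligned shape might look as follows (a hypothetical invocation; like all Pallas TPU kernels it needs a TPU backend):

    import jax.numpy as jnp

    raw_key = jnp.array([0xDEADBEEF, 0xCAFEF00D], dtype=jnp.uint32)
    bits = philox_4x32_kernel(raw_key, shape=(512, 512),
                              unpadded_shape=(512, 512),
                              block_size=(256, 256))
    assert bits.shape == (512, 512) and bits.dtype == jnp.uint32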


def philox_4x32_count(key,
                      shape: Shape,
                      offset: typing.ArrayLike = 0,
                      fuse_output: bool = True):
  """Calls philox_4x32_kernel, padding non-block-aligned shapes as needed.

  Scalar and 1D requests are promoted to 2D, and trailing dimensions that are
  not divisible by BLOCK_SIZE are rounded up for the kernel and sliced back
  down afterwards.
  """
  if len(shape) == 0:
    return philox_4x32_count(
        key, (1, 1), offset=offset, fuse_output=fuse_output)[..., 0, 0]
  elif len(shape) == 1:
    return philox_4x32_count(
        key, (1, *shape), offset=offset, fuse_output=fuse_output)[..., 0, :]

  requires_pad = (
      shape[-2] % BLOCK_SIZE[-2] != 0) or (shape[-1] % BLOCK_SIZE[-1] != 0)
  if requires_pad:
    padded_shape = tuple(shape[:-2]) + (
        prng_utils.round_up(shape[-2], BLOCK_SIZE[-2]),
        prng_utils.round_up(shape[-1], BLOCK_SIZE[-1]),
    )
    padded_result = philox_4x32_kernel(
        key, padded_shape, shape,
        block_size=BLOCK_SIZE, offset=offset,
        fuse_output=fuse_output)
    return padded_result[..., :shape[-2], :shape[-1]]
  else:
    return philox_4x32_kernel(key, shape, shape,
                              block_size=BLOCK_SIZE, offset=offset,
                              fuse_output=fuse_output)
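For example, a request that is not block-aligned runs on the rounded-up grid and is sliced back down (illustrative; key values arbitrary):

    raw_key = jnp.array([0, 42], dtype=jnp.uint32)
    bits = philox_4x32_count(raw_key, (300, 70))  # padded to (512, 256) internally
    assert bits.shape == (300, 70)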


def philox_split(key, shape: Shape):
  """Splits a key into new keys, returning an array of shape (*shape, 2)."""
  bits1, bits2 = philox_4x32_count(key, shape, fuse_output=False)
  return jnp.stack([bits1, bits2], axis=bits1.ndim)


def philox_random_bits(key, bit_width: int, shape: Shape):
  if bit_width != 32:
    raise ValueError("Only 32-bit PRNG supported.")
  return philox_4x32_count(key, shape, fuse_output=True)


def philox_fold_in(key, data):
  assert data.ndim == 0
  return philox_4x32_count(key, (), offset=data, fuse_output=False)


plphilox_prng_impl = prng.PRNGImpl(
    key_shape=(2,),
    seed=prng.threefry_seed,
    split=philox_split,
    random_bits=philox_random_bits,
    fold_in=philox_fold_in,
    name="pallas_philox4x32",
    tag="pllox")

prng.register_prng(plphilox_prng_impl)
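Importing this module runs the registration above, after which the impl plugs into the standard typed-key API. A sketch, assuming the file lands at jax/experimental/pallas/ops/tpu/random/philox.py and a TPU backend is available:

    import jax
    from jax.experimental.pallas.ops.tpu.random import philox  # noqa: F401

    key = jax.random.key(42, impl="pallas_philox4x32")
    subkeys = jax.random.split(key, 3)              # dispatches to philox_split
    bits = jax.random.bits(subkeys[0], (128, 128))  # philox_random_bits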
jax/experimental/pallas/ops/tpu/random/prng_utils.py

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
# Copyright 2024 The JAX Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Helper functions for PRNG kernels."""
from typing import Sequence
from jax import lax
import jax.numpy as jnp

Shape = Sequence[int]

round_up = lambda x, y: (x + y - 1) // y * y

def blocked_iota(block_shape: Shape,
                 total_shape: Shape):
  """Computes one block of a larger iota.

  Args:
    block_shape: The shape of the output block.
    total_shape: The shape of the full array being tiled.
  Returns:
    The iota values of the block at the origin; add a scalar block offset
    (see compute_scalar_offset) to obtain the values of any other block.
  """
  iota_data = jnp.zeros(block_shape, dtype=jnp.uint32)
  multiplier = 1
  for dim in range(len(block_shape)-1, -1, -1):
    block_mult = 1
    counts_lo = lax.broadcasted_iota(
        dtype=jnp.uint32, shape=block_shape, dimension=dim
    )
    iota_data += counts_lo * multiplier * block_mult
    multiplier *= total_shape[dim]
  return iota_data


def compute_scalar_offset(iteration_index,
                          total_size: Shape,
                          block_size: Shape):
  """Returns the row-major flat index of the first element of a block."""
  ndims = len(iteration_index)
  dim_size = 1
  total_idx = 0
  for i in range(ndims-1, -1, -1):
    dim_idx = iteration_index[i] * block_size[i]
    total_idx += dim_idx * dim_size
    dim_size *= total_size[i]
  return total_idx
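Taken together, a block's local iota plus its scalar offset reproduces the matching tile of the full iota; a small illustrative check (not part of the commit):

    import numpy as np

    full = np.arange(16, dtype=np.uint32).reshape(4, 4)
    block = blocked_iota((2, 2), (4, 4))                 # [[0, 1], [4, 5]]
    off = compute_scalar_offset((1, 1), (4, 4), (2, 2))  # flat index 10
    np.testing.assert_array_equal(np.asarray(block) + off, full[2:4, 2:4])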

jax/experimental/pallas/ops/tpu/random/threefry.py

Lines changed: 6 additions & 42 deletions
@@ -14,54 +14,17 @@
 """Implementation of the Threefry PRNG as a Pallas kernel."""
 from typing import Sequence
 import jax
-from jax import lax
 from jax._src import prng
 from jax.experimental import pallas as pl
 from jax.experimental.pallas import tpu as pltpu
 import jax.numpy as jnp
 import numpy as np
+from jax.experimental.pallas.ops.tpu.random import prng_utils
 
 Shape = Sequence[int]
 
 BLOCK_SIZE = (256, 256)
 
-_round_up = lambda x, y: (x + y - 1) // y * y
-
-
-def blocked_iota(block_shape: Shape,
-                 total_shape: Shape):
-  """Computes a sub-block of a larger shaped iota.
-
-  Args:
-    block_shape: The output block shape of the iota.
-    total_shape: The total shape of the input tensor.
-  Returns:
-    Result of the blocked iota.
-  """
-  iota_data = jnp.zeros(block_shape, dtype=jnp.uint32)
-  multiplier = 1
-  for dim in range(len(block_shape)-1, -1, -1):
-    block_mult = 1
-    counts_lo = lax.broadcasted_iota(
-        dtype=jnp.uint32, shape=block_shape, dimension=dim
-    )
-    iota_data += counts_lo * multiplier * block_mult
-    multiplier *= total_shape[dim]
-  return iota_data
-
-
-def _compute_scalar_offset(iteration_index,
-                           total_size: Shape,
-                           block_size: Shape):
-  ndims = len(iteration_index)
-  dim_size = 1
-  total_idx = 0
-  for i in range(ndims-1, -1, -1):
-    dim_idx = iteration_index[i] * block_size[i]
-    total_idx += dim_idx * dim_size
-    dim_size *= total_size[i]
-  return total_idx
-
 
 def threefry_2x32_count(key,
                         shape: Shape,
@@ -97,8 +60,9 @@ def threefry_2x32_count(key,
 
   def kernel(key_ref, out_ref):
     counts_idx = tuple(pl.program_id(i) for i in range(len(grid_dims)))
-    offset = _compute_scalar_offset(counts_idx, unpadded_shape, block_shape)
-    counts_lo = blocked_iota(block_size, unpadded_shape)
+    offset = prng_utils.compute_scalar_offset(
+        counts_idx, unpadded_shape, block_shape)
+    counts_lo = prng_utils.blocked_iota(block_size, unpadded_shape)
     counts_lo = counts_lo + offset
     counts_lo = counts_lo.astype(jnp.uint32)
     # TODO(justinfu): Support hi bits on count.
@@ -134,8 +98,8 @@ def plthreefry_random_bits(key, bit_width: int, shape: Shape):
       shape[-2] % BLOCK_SIZE[-2] != 0) or (shape[-1] % BLOCK_SIZE[-1] != 0)
   if requires_pad:
     padded_shape = tuple(shape[:-2]) + (
-        _round_up(shape[-2], BLOCK_SIZE[-2]),
-        _round_up(shape[-1], BLOCK_SIZE[-1]),
+        prng_utils.round_up(shape[-2], BLOCK_SIZE[-2]),
+        prng_utils.round_up(shape[-1], BLOCK_SIZE[-1]),
     )
     padded_result = threefry_2x32_count(
         key, padded_shape, shape, block_size=BLOCK_SIZE)
