
Commit 81f8e65

Merge pull request #229 from ROCm/ci-upstream-sync-112_1
CI: 02/11/25 upstream sync
2 parents 073bc92 + 3745591 commit 81f8e65


83 files changed: +1789 / -1576 lines

.bazelrc

Lines changed: 1 addition & 1 deletion
@@ -233,7 +233,7 @@ build:ci_linux_aarch64_cuda --config=cuda --config=build_cuda_with_nvcc
 build:ci_linux_aarch64_cuda --action_env=CLANG_CUDA_COMPILER_PATH="/usr/lib/llvm-18/bin/clang"

 # Mac x86 CI configs
-build:ci_darwin_x86_64 --macos_minimum_os=10.14
+build:ci_darwin_x86_64 --macos_minimum_os=11.0
 build:ci_darwin_x86_64 --config=macos_cache_push
 build:ci_darwin_x86_64 --verbose_failures=true
 build:ci_darwin_x86_64 --color=yes

.github/workflows/cloud-tpu-ci-nightly.yml

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@ jobs:
        python-version: ["3.10"]
    name: "TPU test (jaxlib=${{ matrix.jaxlib-version }}, ${{ matrix.tpu.type }})"
    env:
-     LIBTPU_OLDEST_VERSION_DATE: 20240922
+     LIBTPU_OLDEST_VERSION_DATE: 20241118
      PYTHON: python${{ matrix.python-version }}
    runs-on: ${{ matrix.tpu.runner }}
    container: "us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/ml-build:latest"

build/build.py

Lines changed: 2 additions & 0 deletions
@@ -588,6 +588,8 @@ async def main():
   )
   for option in args.bazel_options:
     wheel_build_command_base.append(option)
+  if "cuda" in args.wheels:
+    wheel_build_command_base.append("--config=cuda_libraries_from_stubs")

   with open(".jax_configure.bazelrc", "w") as f:
     jax_configure_options = utils.get_jax_configure_bazel_options(wheel_build_command_base.get_command_as_list())

docs/pallas/grid_blockspec.md

Lines changed: 1 addition & 1 deletion
@@ -88,7 +88,7 @@ If the block shape does not divide evenly the overall shape then the
 last iteration on each axis will still receive references to blocks
 of `block_shape` but the elements that are out-of-bounds are padded
 on input and discarded on output. The values of the padding are unspecified, and
-you should assume they is garbage. In the `interpret=True` mode, we
+you should assume they are garbage. In the `interpret=True` mode, we
 pad with NaN for floating-point values, to give users a chance to
 spot accessing out-of-bounds elements, but this behavior should not
 be depended upon. Note that at least one of the
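
To make the padding behavior described in this hunk concrete, here is a minimal sketch (not part of this commit; the kernel name, shapes, and grid are made up for illustration): a 10-row input processed in 8-row blocks, so the last grid step sees a partially out-of-bounds block.

# Illustrative sketch only; assumes the current pl.BlockSpec/pl.pallas_call API.
import jax
import jax.numpy as jnp
from jax.experimental import pallas as pl

def double_kernel(x_ref, o_ref):
  # The last grid step receives an 8-row block even though only 2 rows remain.
  o_ref[...] = x_ref[...] * 2.0

x = jnp.arange(40, dtype=jnp.float32).reshape(10, 4)
out = pl.pallas_call(
    double_kernel,
    grid=(2,),  # two 8-row blocks cover the 10 rows; the second is padded
    in_specs=[pl.BlockSpec(block_shape=(8, 4), index_map=lambda i: (i, 0))],
    out_specs=pl.BlockSpec(block_shape=(8, 4), index_map=lambda i: (i, 0)),
    out_shape=jax.ShapeDtypeStruct((10, 4), jnp.float32),
    interpret=True,  # in interpret mode the out-of-bounds padding shows up as NaN
)(x)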

jax/_src/api.py

Lines changed: 1 addition & 2 deletions
@@ -2060,10 +2060,9 @@ def linear_transpose(fun: Callable, *primals, reduce_axes=()) -> Callable:
   shape/dtypes/structure as ``primals``.

   >>> import jax
-  >>> import types
   >>>
   >>> f = lambda x, y: 0.5 * x - 0.5 * y
-  >>> scalar = types.SimpleNamespace(shape=(), dtype=np.dtype(np.float32))
+  >>> scalar = jax.ShapeDtypeStruct(shape=(), dtype=np.dtype(np.float32))
   >>> f_transpose = jax.linear_transpose(f, scalar, scalar)
   >>> f_transpose(1.0)
   (Array(0.5, dtype=float32), Array(-0.5, dtype=float32))

jax/_src/api_util.py

Lines changed: 23 additions & 6 deletions
@@ -27,7 +27,7 @@
 from jax._src.state.types import AbstractRef
 from jax._src.tree_util import (
     PyTreeDef, tree_flatten, tree_unflatten, tree_map,
-    treedef_children, generate_key_paths, keystr, broadcast_prefix,
+    treedef_children, generate_key_paths, broadcast_prefix,
     prefix_errors)
 from jax._src.tree_util import _replace_nones
 from jax._src import linear_util as lu
@@ -595,6 +595,15 @@ def debug_info(
     sourceinfo: str | None = None,
     signature: inspect.Signature | None = None,
 ) -> core.DebugInfo:
+  """Constructs core.DebugInfo for a function given example args and kwargs.
+
+  `args` and `kwargs` are example positional and keyword arguments, used with
+  `inspect.Signature` to get the names of arguments. The arguments that are
+  considered static for tracing purposes should be included, and designated
+  using `static_argnums` and `static_argnames`.
+
+  See the docstring for linear_util.DebugInfo.
+  """
   if sourceinfo is None:
     sourceinfo = fun_sourceinfo(fun)
   if signature is None:
@@ -610,10 +619,17 @@ def fun_signature(fun: Callable) -> inspect.Signature | None:
   except (ValueError, TypeError):
     return None

-def save_wrapped_fun_sourceinfo(wrapper: Callable, wrapped: Callable):
+def save_wrapped_fun_sourceinfo(wrapper: Callable,
+                                wrapped: Callable | core.DebugInfo | None) -> None:
   # Prefer this to functools.wraps because it does not create a reference to
   # the wrapped function.
-  setattr(wrapper, "__fun_sourceinfo__", fun_sourceinfo(wrapped))
+  if isinstance(wrapped, core.DebugInfo):
+    func_src_info = wrapped.func_src_info
+  elif callable(wrapped):
+    func_src_info = fun_sourceinfo(wrapped)
+  else:
+    return
+  setattr(wrapper, "__fun_sourceinfo__", func_src_info)

 _fun_name_re = re.compile(r"(?:<built-in function (\S+)>)")

@@ -664,12 +680,13 @@ def _non_static_arg_names(fn_signature: inspect.Signature | None,
   except (ValueError, TypeError):
     pass
   else:
-    return tuple(f'{name}{keystr(path)}' for name, x in ba.arguments.items()
+    return tuple(f'{name}{lu._clean_keystr_arg_names(path)}'
+                 for name, x in ba.arguments.items()
                  for path, l in generate_key_paths(x) if l is not static)
-  args_arg_names = tuple(f'args{keystr(path)}'
+  args_arg_names = tuple(f'args{lu._clean_keystr_arg_names(path)}'
                          for path, l in generate_key_paths(args_)
                          if l is not static)
-  kwargs_arg_names = tuple(f'kwargs{keystr(path)}'
+  kwargs_arg_names = tuple(f'kwargs{lu._clean_keystr_arg_names(path)}'
                            for path, l in generate_key_paths(kwargs_)
                            if l is not static)
   arg_names = args_arg_names + kwargs_arg_names
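
For context, a hedged sketch of the convention the new `debug_info` docstring describes, mirroring the call sites elsewhere in this commit (e.g. custom_batching.py); `my_transform` is a hypothetical wrapper, not part of JAX.

# Sketch only: builds a core.DebugInfo from example arguments and attaches it
# to the wrapped function so downstream tracing keeps useful argument names.
from jax._src import api_util
from jax._src import linear_util as lu

def my_transform(fun, *example_args, **example_kwargs):
  dbg = api_util.debug_info("my_transform", fun, example_args, example_kwargs)
  return lu.wrap_init(fun, debug_info=dbg)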

jax/_src/callback.py

Lines changed: 15 additions & 7 deletions
@@ -35,6 +35,7 @@
 from jax._src.interpreters import mlir
 from jax._src.lax import lax
 from jax._src.lax.control_flow.loops import map as lax_map
+from jax._src.lax.control_flow.loops import scan
 from jax._src.lib import xla_client as xc
 from jax._src.sharding_impls import SingleDeviceSharding
 from jax._src.typing import DeprecatedArg
@@ -163,7 +164,10 @@ def callback_batching_rule(

   # For FFI calls we must update the layouts. We handle the output layouts
   # here, but the input layout updates depend on the vmap_method parameter.
-  if vmap_method != "sequential" and kwargs.get("output_layouts") is not None:
+  if (
+      vmap_method not in ("sequential", "sequential_unrolled") and
+      kwargs.get("output_layouts") is not None
+  ):
     kwargs["output_layouts"] = tuple(
         None if layout is None else tuple(n + 1 for n in layout) + (0,)
         for layout in kwargs["output_layouts"])
@@ -199,7 +203,7 @@ def callback_batching_rule(
         result_avals=batched_result_avals,
         **kwargs,
     )
-  elif vmap_method == "sequential":
+  elif vmap_method == "sequential" or vmap_method == "sequential_unrolled":
     is_batched = [d is not batching.not_mapped for d in dims]
     unbatched_args, batched_args = util.partition_list(is_batched, new_args)
     def _batch_fun(batched_args):
@@ -211,12 +215,14 @@ def _batch_fun(batched_args):
           vmap_method=vmap_method,
           **kwargs,
       )
-    outvals = lax_map(_batch_fun, batched_args)
+    unroll = vmap_method == "sequential_unrolled"
+    g = lambda _, x: ((), _batch_fun(x))
+    _, outvals = scan(g, (), batched_args, unroll=unroll)
   else:
     raise NotImplementedError(
         f"vmap is only supported for the {prim.name} primitive when vmap_method "
-        "is one of 'sequential', 'expand_dims', 'broadcast_all', or "
-        "'legacy_vectorized'.")
+        "is one of 'sequential', 'sequential_unrolled', 'expand_dims', "
+        f"'broadcast_all', or 'legacy_vectorized'. Got {vmap_method=}.")
   return tuple(outvals), (0,) * len(outvals)


@@ -371,6 +377,8 @@ def pure_callback(
     is deprecated and it will eventually raise ``NotImplementedError``.
   * ``vmap_method="sequential"`` uses :func:`~jax.lax.map` to loop over
     the batched arguments, calling ``callback`` once for each batch element.
+  * ``vmap_method="sequential_unrolled"`` is like ``sequential``, but the loop
+    is unrolled.
   * ``vmap_method="expand_dims"`` calls ``callback`` with new axes of size ``1``
     added as the leading dimension unbatched inputs.
   * ``vmap_method="broadcast_all"`` behaves like ``expand_dims``, but the
@@ -459,8 +467,8 @@ def pure_callback(
         "the vectorized and vmap_method arguments of jax.pure_callback cannot "
         "be used together. Please use the vmap_method argument.")
     vmap_method = "legacy_vectorized" if vectorized else "sequential"
-  allowed_vmap_methods = ["sequential", "expand_dims", "broadcast_all",
-                          "legacy_vectorized", None]
+  allowed_vmap_methods = ["sequential", "sequential_unrolled", "expand_dims",
+                          "broadcast_all", "legacy_vectorized", None]
   if vmap_method not in allowed_vmap_methods:
     raise ValueError(
         f"vmap_method must be on of the allowed methods {allowed_vmap_methods}, "

jax/_src/checkify.py

Lines changed: 20 additions & 14 deletions
@@ -833,7 +833,7 @@ def new_body_f(*c_consts_and_vals):
     # This checks if the next cond application will error
     _ = cond_f(*c_consts, *out)
     return out
-  new_body_f_ = lu.wrap_init(new_body_f)
+  new_body_f_ = lu.wrap_init(new_body_f, debug_info=body_jaxpr.jaxpr.debug_info)
   c_consts_avals = cond_jaxpr.in_avals[:c_consts_num]
   jaxpr, _, (), () = pe.trace_to_jaxpr_dynamic(new_body_f_, [*c_consts_avals,
                                                              *body_jaxpr.in_avals])
@@ -952,7 +952,8 @@ def remat_error_check(error, enabled_errors, *vals_in, jaxpr, **params):


 def shard_map_error_check(
-    error, enabled_errors, *vals_in, jaxpr, in_names, out_names, **kwargs
+    error: Error, enabled_errors, *vals_in,
+    jaxpr: core.Jaxpr, in_names, out_names, **kwargs
 ):
   if (mesh := kwargs.get('mesh')) is None:
     raise ValueError('Mesh must be provided for shard_map with checkify.')
@@ -976,7 +977,6 @@
   )
   num_out_error_vals = out_tree.num_leaves - len(out_names)

-  @lu.wrap_init
   def expand_errors_leading_dim(*xs):
     outs = core.eval_jaxpr(checked_jaxpr.jaxpr, checked_jaxpr.consts, *xs)
     errs, outs = split_list(outs, [num_out_error_vals])
@@ -985,15 +985,18 @@

   with core.extend_axis_env_nd(mesh.shape.items()):
     jaxpr, _, consts, () = pe.trace_to_jaxpr_dynamic(
-        expand_errors_leading_dim, checked_jaxpr.in_avals
+        lu.wrap_init(expand_errors_leading_dim,
+                     debug_info=checked_jaxpr.jaxpr.debug_info),
+        checked_jaxpr.in_avals
     )
   checked_jaxpr = core.ClosedJaxpr(jaxpr, consts)

   # Update shard_map params to account for extra error values.
   # Use fully sharded partitioning for out errors.
   new_out_names = (*([{0: mesh.axis_names}] * num_out_error_vals), *out_names)
   subfun = lu.hashable_partial(
-      lu.wrap_init(core.eval_jaxpr), checked_jaxpr.jaxpr, checked_jaxpr.consts
+      lu.wrap_init(core.eval_jaxpr, debug_info=checked_jaxpr.jaxpr.debug_info),
+      checked_jaxpr.jaxpr, checked_jaxpr.consts
   )
   new_params = dict(
       jaxpr=checked_jaxpr.jaxpr,
@@ -1007,8 +1010,10 @@ def expand_errors_leading_dim(*xs):
   return tree_unflatten(out_tree, err_and_out)
 error_checks[shard_map.shard_map_p] = shard_map_error_check

-def custom_jvp_call_rule(in_err, enabled_errors, *in_vals, num_consts,
-                         jvp_jaxpr_thunk, call_jaxpr, **params):
+def custom_jvp_call_rule(in_err: Error,
+                         enabled_errors: set, *in_vals, num_consts,
+                         jvp_jaxpr_fun: lu.WrappedFun,
+                         call_jaxpr: core.ClosedJaxpr, **params):
   # The types to have in mind are:
   # jvp : (a -> b) -> (a, T a) -> (b, T b)
   # checkify : (a -> b) -> a -> Err b
@@ -1021,10 +1026,11 @@ def custom_jvp_call_rule(in_err, enabled_errors, *in_vals, num_consts,
   err_vals, err_tree = jtu.tree_flatten(in_err)
   partial_checkify = lu.wrap_init(
       functools.partial(checkify_jaxpr_flat, call_jaxpr.jaxpr,
-                        call_jaxpr.consts, enabled_errors, err_tree))
+                        call_jaxpr.consts, enabled_errors, err_tree),
+      debug_info=call_jaxpr.jaxpr.debug_info)
   partial_checkify, f_metadata = _flatten_and_get_error_metadata_thunk(
       partial_checkify)
-  jvp = lift_jvp(err_tree.num_leaves, num_consts, jvp_jaxpr_thunk)
+  jvp = lift_jvp(err_tree.num_leaves, num_consts, jvp_jaxpr_fun)
   jvp, jvp_out_tree = flatten_fun_output(jvp)
   all_outs = custom_derivatives.custom_jvp_call_p.bind(
       partial_checkify, jvp, *err_vals, *in_vals, **params)
@@ -1041,17 +1047,17 @@ def custom_jvp_call_rule(in_err, enabled_errors, *in_vals, num_consts,

 # Compared to custom_derivatives.lift_jvp, we're handling the extra inputs and
 # outputs that checkify adds (just forwarding the error data's primal and
-# tangent components). The jaxpr in jvp_jaxpr_thunk doesn't expect those.
+# tangent components). The jaxpr in jvp_jaxpr_fun doesn't expect those.
 # TODO(mattjj): can we simplify this, or dedup with custom_derivatives.lift_jvp?
 # Adding another layer of lu.transformation was tricky, though maybe doable.
-def lift_jvp(num_errs, num_consts, jvp_jaxpr_thunk):
-  @lu.wrap_init
+def lift_jvp(num_errs: int, num_consts: int,
+             jvp_jaxpr_fun: lu.WrappedFun) -> lu.WrappedFun:
   def jvp(*xs):
     n, ragged = divmod(len(xs), 2)
     assert not ragged
     primals, tangents = xs[num_consts+num_errs:n], xs[n+num_consts+num_errs:]
     zeros = [type(t) is SymbolicZero for t in tangents]
-    jvp_jaxpr, jvp_consts, out_zeros = jvp_jaxpr_thunk(*zeros)
+    jvp_jaxpr, jvp_consts, out_zeros = jvp_jaxpr_fun.call_wrapped(*zeros)
     nonzero_tangents = [t for t in tangents if type(t) is not SymbolicZero]
     out = core.eval_jaxpr(jvp_jaxpr, jvp_consts, *primals, *nonzero_tangents)
     out_primals, nz_out_tangents = split_list(out, [len(out_zeros)])
@@ -1063,7 +1069,7 @@ def jvp(*xs):
     primal_errs = xs[num_consts:num_consts+num_errs]
     tangent_errs = xs[n+num_consts:n+num_consts+num_errs]
     return [*primal_errs, *out_primals, *tangent_errs, *out_tangents]
-  return jvp
+  return lu.wrap_init(jvp, debug_info=jvp_jaxpr_fun.debug_info)

 def custom_vjp_call_jaxpr_rule(in_err, enabled_errors, *in_vals,
                                fun_jaxpr: core.ClosedJaxpr,
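
For context, a user-level sketch (not part of this commit) that runs checkify over a function containing a custom_jvp call, one of the code paths whose debug_info plumbing changes above; `safe_log`, `f`, and the check are illustrative only.

import jax
import jax.numpy as jnp
from jax.experimental import checkify

@jax.custom_jvp
def safe_log(x):
  return jnp.log(x)

@safe_log.defjvp
def safe_log_jvp(primals, tangents):
  (x,), (t,) = primals, tangents
  return safe_log(x), t / x

def f(x):
  checkify.check(x > 0, "x must be positive")
  return safe_log(x)

err, out = checkify.checkify(f)(jnp.float32(2.0))
err.throw()  # no error is raised for a positive input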

jax/_src/core.py

Lines changed: 2 additions & 1 deletion
@@ -640,7 +640,8 @@ def process_custom_jvp_call(self, primitive, fun, jvp, tracers, *,
           "to handle custom_jvp primitives")
     raise NotImplementedError(msg)

-  def process_custom_transpose(self, prim, call, tracers, **params):
+  def process_custom_transpose(self, prim: Primitive,
+                               call: lu.WrappedFun, tracers, **params):
     msg = (f"{type(self)} must override process_custom_transpose "
            "to handle custom_transpose_call primitives")
     raise NotImplementedError(msg)

jax/_src/custom_batching.py

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -141,25 +141,28 @@ def def_vmap(
141141

142142
@traceback_util.api_boundary
143143
def __call__(self, *args, **kwargs):
144+
debug_fun = api_util.debug_info("custom_vmap fun", self.fun,
145+
args, kwargs)
144146
args = api_util.resolve_kwargs(self.fun, args, kwargs)
145-
fun_name = getattr(self.fun, "__name__", str(self.fun))
146147
if not self.vmap_rule:
147148
raise AttributeError(
148-
f"No batching rule defined for custom_vmap function {fun_name} "
149+
f"No batching rule defined for custom_vmap function {debug_fun.func_name} "
149150
"using def_vmap.")
150-
debug = api_util.debug_info("custom_vmap", self.fun, args, {})
151151
args_flat, in_tree = tree_flatten(args)
152152
flat_fun, out_tree = api_util.flatten_fun_nokwargs(
153-
lu.wrap_init(self.fun, debug_info=debug),
153+
lu.wrap_init(self.fun, debug_info=debug_fun),
154154
in_tree)
155155
in_avals = [core.get_aval(x) for x in args_flat]
156156
jaxpr, _, consts, () = pe.trace_to_jaxpr_dynamic(flat_fun, in_avals)
157157
closed_call = core.ClosedJaxpr(pe.convert_constvars_jaxpr(jaxpr), ())
158158
in_tree = treedef_tuple((tree_structure(consts), in_tree))
159159
assert self.vmap_rule is not None
160+
debug_rule = api_util.debug_info("custom_vmap rule", self.vmap_rule,
161+
(0, args, args), {})
160162
out_flat = custom_vmap_p.bind(*consts, *args_flat,
161163
call=closed_call,
162-
rule=ClosedRule(self.vmap_rule),
164+
rule=ClosedRule(self.vmap_rule,
165+
debug_rule),
163166
in_tree=in_tree,
164167
out_tree=out_tree())
165168
return tree_unflatten(out_tree(), out_flat)
@@ -170,9 +173,10 @@ def __call__(self, *args, **kwargs):
170173
# Define a class, instead of making a function closing over `rule`, so
171174
# that we can override __str__
172175
class ClosedRule:
173-
def __init__(self, rule):
176+
def __init__(self, rule: Callable, debug: core.DebugInfo):
174177
functools.update_wrapper(self, rule)
175178
self.rule = rule
179+
self.debug = debug
176180

177181
def __call__(self, axis_size, all_in_batched, *all_args):
178182
_, args = all_args
@@ -252,8 +256,11 @@ def custom_vmap_abstract_eval(*in_avals, call, **_):
252256
return call.out_avals
253257

254258

255-
def custom_vmap_jvp(primals, tangents, *, call, rule, in_tree, out_tree):
256-
def jvp_of_rule_rule(axis_size, in_batched, primals, tangents):
259+
def custom_vmap_jvp(primals, tangents, *,
260+
call: core.ClosedJaxpr,
261+
rule: ClosedRule,
262+
in_tree: tree_util.PyTreeDef, out_tree: tree_util.PyTreeDef):
263+
def jvp_of_rule_rule(axis_size: int, in_batched, primals, tangents):
257264
in_batched_ps, in_batched_ts = in_batched
258265

259266
mutually_batched = tree_map(operator.and_, in_batched_ps, in_batched_ts)
@@ -281,11 +288,14 @@ def to_jvp(*primals):
281288
out_mutually_batched.store(out_batched)
282289
return out
283290

291+
api_util.save_wrapped_fun_sourceinfo(to_jvp, call.jaxpr.debug_info)
284292
def to_vmap_over_extra_batched_dims(primals, tangents):
285293
return api.jvp(to_jvp, primals, tangents)
286294

287295
to_vmap_over_extra_batched_dims_flat, out_tree2 = api_util.flatten_fun_nokwargs(
288-
lu.wrap_init(to_vmap_over_extra_batched_dims),
296+
lu.wrap_init(to_vmap_over_extra_batched_dims,
297+
# TODO(necula): fix the debug_info calling convention
298+
debug_info=call.jaxpr.debug_info),
289299
tree_ps_ts)
290300

291301
flat_out_ps_ts, flat_out_axes = vmap_unrestricted(
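
For context, a hedged user-level sketch (not part of this commit) of the jax.custom_batching.custom_vmap API whose tracing paths the debug_info changes above touch; `vector_dot` and its rule are made up for illustration, and the rule assumes both arguments are batched along their leading axis.

import jax
import jax.numpy as jnp
from jax.custom_batching import custom_vmap

@custom_vmap
def vector_dot(u, v):
  return jnp.dot(u, v)

@vector_dot.def_vmap
def vector_dot_vmap_rule(axis_size, in_batched, u, v):
  del axis_size
  u_batched, v_batched = in_batched
  assert u_batched and v_batched  # sketch handles only the fully batched case
  out = jnp.einsum("ni,ni->n", u, v)
  return out, True

us = jnp.ones((3, 4))
vs = jnp.ones((3, 4))
print(jax.vmap(vector_dot)(us, vs))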
