Commit b6a5b82

add more files
1 parent 2ccd5dc commit b6a5b82

File tree

15 files changed: +598 additions, -307 deletions


torchax/torchax/CONTRIBUTING.md

Lines changed: 0 additions & 38 deletions
This file was deleted.

torchax/torchax/amp.py

Lines changed: 13 additions & 0 deletions
@@ -57,6 +57,19 @@ def is_float(a):
 
 @contextlib.contextmanager
 def autocast(device, dtype=torch.bfloat16, env=None):
+  """A context manager for automatic mixed precision (AMP).
+
+  This context manager enables automatic mixed precision, which can improve
+  performance by using lower-precision data types for certain operations.
+
+  **Arguments:**
+
+  * `device`: The device to use for autocasting (e.g., "cuda", "cpu").
+  * `dtype` (`torch.dtype`, optional): The lower-precision data type to use.
+    Defaults to `torch.bfloat16`.
+  * `env` (optional): The `torchax` environment. If not provided, the default
+    environment is used.
+  """
   del device
   if env is None:
     import torchax
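
To illustrate the `autocast` API documented above, here is a minimal usage sketch. It assumes `torchax.default_env()` returns the active environment and that tensors are moved to the JAX backend with `.to('jax')`; the model setup is purely illustrative.

```python
# Minimal usage sketch for torchax.amp.autocast (assumptions noted above).
import torch
import torchax
from torchax import amp

env = torchax.default_env()

with env:
  model = torch.nn.Linear(128, 128).to('jax')  # move weights to the JAX backend
  x = torch.randn(4, 128).to('jax')
  # Ops covered by the autocast policy run in bfloat16 inside this block.
  with amp.autocast('jax', dtype=torch.bfloat16, env=env):
    y = model(x)
```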

torchax/torchax/config.py

Lines changed: 30 additions & 0 deletions
@@ -3,6 +3,36 @@
 
 @dataclasses.dataclass
 class Configuration:
+  """A dataclass for configuring the behavior of `torchax`.
+
+  **Attributes:**
+
+  * `debug_print_each_op` (`bool`): If `True`, prints each operation as it is
+    dispatched.
+  * `debug_accuracy_for_each_op` (`bool`): If `True`, checks the accuracy of
+    each operation by comparing its output with the equivalent PyTorch
+    operation on the CPU.
+  * `debug_mixed_tensor` (`bool`): If `True`, enables debugging for mixed
+    tensor operations.
+  * `debug_print_each_op_operands` (`bool`): If `True`, prints the operands of
+    each operation.
+  * `use_int32_for_index` (`bool`): If `True`, uses `int32` for indexing
+    operations.
+  * `allow_mixed_math_with_scalar_tensor` (`bool`): If `True`, allows mixed
+    math operations between `torchax.Tensor` and scalar `torch.Tensor`s.
+  * `force_materialize_views` (`bool`): If `True`, eagerly materializes `View`
+    objects into `torchax.Tensor`s.
+  * `use_dlpack_for_data_conversion` (`bool`): If `True`, uses DLPack for
+    converting between `jax.Array` and `torch.Tensor`.
+  * `use_tpu_flash_attention` (`bool`): If `True`, uses TPU-optimized flash
+    attention.
+  * `shmap_flash_attention` (`bool`): If `True`, uses `shard_map` for flash
+    attention.
+  * `treat_cuda_as_jax_device` (`bool`): If `True`, treats CUDA devices as JAX
+    devices.
+  * `internal_respect_torch_return_dtypes` (`bool`): If `True`, respects the
+    return data types of PyTorch operations.
+  """
   debug_print_each_op: bool = False
   debug_accuracy_for_each_op: bool = False
   debug_mixed_tensor: bool = False
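
As a quick illustration of how these flags might be toggled, here is a hedged sketch. It assumes the active environment exposes the `Configuration` instance as `env.config`; that attribute name is an assumption, not something stated in this diff.

```python
# Hedged sketch: flipping Configuration flags on the active environment.
# The `env.config` attribute is assumed; adjust to however your torchax
# version exposes its Configuration instance.
import torchax

env = torchax.default_env()
env.config.debug_print_each_op = True        # log every dispatched op
env.config.use_tpu_flash_attention = False   # disable the TPU flash-attention path
```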

torchax/torchax/decompositions.py

Lines changed: 14 additions & 9 deletions
@@ -1,10 +1,10 @@
-"""This file contains some decompositons that are not available in torch stable.
+"""This file contains PyTorch operator decompositions that are not available in
+the stable version of PyTorch.
 
-Most likely from Content of
-https://github.com/pytorch/pytorch/blob/main/torch/_decomp/decompositions.py
-at main branch HEAD that we find useful here.
-
-Can also contain decompositions of a torch op in terms of other torch ops.
+The decompositions are primarily sourced from the `main` branch of the PyTorch
+repository and are included here to provide support for newer operators. This
+module can also contain decompositions of a PyTorch op in terms of other
+PyTorch ops.
 """
 
 import functools
@@ -104,18 +104,21 @@ def _reflection_or_replication_pad(
 
 
 def bernoulli(self, *, generator=None):
+  """Decomposition for the `bernoulli` operator."""
   return (torch.rand_like(self, dtype=torch.float32) < self).to(self.dtype)
 
 
 _try_register(aten.bernoulli.default, bernoulli)
 
 
 def rand_like(self, **kwargs):
+  """Decomposition for the `rand_like` operator."""
   dtype = kwargs.get("dtype", self.dtype)
   return torch.rand(self.shape, dtype=dtype)
 
 
 def channel_shuffle(self, groups):
+  """Decomposition for the `channel_shuffle` operator."""
   batchsize, channels, height, width = self.shape
   channels_per_group = channels // groups
   self = self.reshape(batchsize, groups, channels_per_group, height, width)
@@ -131,6 +134,7 @@ def channel_shuffle(self, groups):
 
 
 def bernoulli_float(self, p=0.5):
+  """Decomposition for the `bernoulli_` operator with a float probability."""
   return self.bernoulli_(p)
 
 
@@ -150,9 +154,10 @@ def _grid_sampler_3d(
     padding_mode: int = 0,
     align_corners: bool = False,
 ) -> Tensor:
-  """References: https://github.com/pytorch/pytorch/blob/06a7dc21c1005750598c37f3adbc031183c74de6/torch/_decomp/decompositions.py#L4075
+  """Decomposition for the `grid_sampler_3d` operator.
 
-  The above implement the 2d case.
+  This implementation is based on the 2D version in the PyTorch repository:
+  https://github.com/pytorch/pytorch/blob/06a7dc21c1005750598c37f3adbc031183c74de6/torch/_decomp/decompositions.py#L4075
   """
   _expand_grid = False
   torch._check(
@@ -773,4 +778,4 @@ def get_summand(ix: torch.Tensor, iy: torch.Tensor, iz: torch.Tensor,
 MUTABLE_DECOMPOSITION = [
     torch.ops.aten.bernoulli_.Tensor,
     torch.ops.aten.bernoulli_.float,
-]
+]
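
The module docstring above describes decompositions of one PyTorch op in terms of others. A hypothetical example of that pattern (not part of this commit) is shown below; the `_try_register` helper referenced in the comment is the one visible in the diff.

```python
# Hypothetical decomposition, illustrating the pattern only: express an ATen
# op using other torch ops. Not part of this commit.
import torch
from torch import Tensor


def silu_decomp(self: Tensor) -> Tensor:
  """Decomposes silu as x * sigmoid(x) using existing torch ops."""
  return self * torch.sigmoid(self)


# Registration would follow the same pattern as in the diff, e.g.:
# _try_register(torch.ops.aten.silu.default, silu_decomp)
```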

torchax/torchax/device_module.py

Lines changed: 8 additions & 0 deletions
@@ -2,32 +2,40 @@
 
 
 def _is_in_bad_fork():
+  """Returns `False` as forking is not applicable in the same way as CUDA."""
   return False
 
 
 def manual_seed_all(seed):
+  """A placeholder for API compatibility; does not affect JAX's PRNG."""
   pass
 
 
 def device_count():
+  """Returns `1` as JAX manages devices as a single logical device."""
   return 1
 
 
 def get_rng_state():
+  """Returns an empty list for API compatibility."""
   return []
 
 
 def set_rng_state(new_state, device):
+  """A placeholder for API compatibility; does not affect JAX's PRNG."""
   pass
 
 
 def is_available():
+  """Returns `True` if JAX is available."""
   return True
 
 
 def current_device():
+  """Returns `0` as JAX manages devices as a single logical device."""
   return 0
 
 
 def get_amp_supported_dtype():
+  """Returns the data types supported by AMP (Automatic Mixed Precision)."""
   return [torch.float16, torch.bfloat16]
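
These functions exist so that PyTorch can treat the JAX backend like any other accelerator device module. Below is a sketch of how such a module is typically wired up; whether torchax registers it under the name 'jax' in exactly this way is an assumption, not something shown in this diff.

```python
# Sketch of registering a custom device module with PyTorch (assumed wiring;
# the actual registration in torchax may differ).
import torch
from torchax import device_module

torch.utils.rename_privateuse1_backend('jax')         # name the private backend 'jax'
torch._register_device_module('jax', device_module)   # expose the API above to torch
```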

torchax/torchax/export.py

Lines changed: 52 additions & 10 deletions
@@ -16,7 +16,13 @@
 
 
 class JaxInterpreter(torch.fx.Interpreter):
-  """Experimental."""
+  """An `fx.Interpreter` that executes a PyTorch FX graph using JAX.
+
+  This interpreter traverses an FX graph and replaces PyTorch operations with
+  their corresponding JAX implementations from the `torchax` operator registry.
+  It is a key component in the process of exporting PyTorch models to JAX and
+  StableHLO.
+  """
 
   def __init__(self, graph_module):
     super().__init__(graph_module)
@@ -74,11 +80,24 @@ def _extract_states_from_exported_program(exported_model):
 
 
 def exported_program_to_jax(exported_program, export_raw: bool = False):
-  """returns a pytree of jax arrays(state), and
+  """Converts a `torch.export.ExportedProgram` to a JAX-compatible function and state.
+
+  This function takes a PyTorch `ExportedProgram`, runs the necessary
+  decompositions, and returns a JAX-compatible function and the model's state
+  (parameters and buffers) as JAX arrays.
+
+  **Arguments:**
 
-  a callable(func) that is jax function.
+  * `exported_program` (`torch.export.ExportedProgram`): The PyTorch
+    `ExportedProgram` to convert.
+  * `export_raw` (`bool`, optional): If `True`, returns the raw states and
+    function without converting them to JAX arrays. Defaults to `False`.
 
-  func(state, input) would be how you call it.
+  **Returns:**
+
+  A tuple containing:
+  * A pytree of JAX arrays representing the model's state.
+  * A JAX-callable function that takes the state and inputs as arguments.
   """
   if torch.__version__ >= '2.2':
     # torch version 2.1 didn't expose this yet
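
A hedged usage sketch for `exported_program_to_jax`, following the docstring above. The toy model and the exact pytree structure of the inputs are assumptions made for illustration.

```python
# Hedged sketch: export a toy model and run it through exported_program_to_jax.
import jax.numpy as jnp
import torch
from torchax import export as tx_export


class SinPlusOne(torch.nn.Module):  # toy model, for illustration only

  def forward(self, x):
    return torch.sin(x) + 1.0


exported = torch.export.export(SinPlusOne(), (torch.randn(4),))
state, func = tx_export.exported_program_to_jax(exported)
# Per the docstring, the function is called as func(state, inputs); the exact
# pytree structure expected for `inputs` is assumed here.
out = func(state, (jnp.ones(4, dtype=jnp.float32),))
```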
@@ -115,8 +134,19 @@ def func(states, inputs):
 
 
 def extract_avals(exported):
-  """Return JAX Abstract Value shapes for all input parameters of the exported
-  program. This supports dynamic batch dimensions, including with constraints.
+  """Returns JAX abstract values (`ShapeDtypeStruct`) for all input parameters of the exported program.
+
+  This function supports dynamic batch dimensions, including those with
+  constraints.
+
+  **Arguments:**
+
+  * `exported` (`torch.export.ExportedProgram`): The exported PyTorch program.
+
+  **Returns:**
+
+  A list of `jax.ShapeDtypeStruct` objects representing the abstract values of
+  the input parameters.
   """
 
   def _to_aval(arg_meta, symbolic_shapes):
@@ -232,12 +262,24 @@ def _build_symbolic_shape(sym, constraint, free_symbols):
 
 
 def exported_program_to_stablehlo(exported_program):
-  """Replacement for torch_xla.stablehlo.exported_program_to_stablehlo
+  """Converts a `torch.export.ExportedProgram` to StableHLO.
+
+  This function serves as a replacement for
+  `torch_xla.stablehlo.exported_program_to_stablehlo`. It supports dynamic
+  dimension sizes and generates explicit checks for Dynamo guards in the IR
+  using `shape_assertion` custom calls.
+
+  **Arguments:**
+
+  * `exported_program` (`torch.export.ExportedProgram`): The exported PyTorch
+    program.
 
-  Convert a program exported via torch.export to StableHLO.
+  **Returns:**
 
-  This supports dynamic dimension sizes and generates explicit checks for
-  dynamo guards in the IR using shape_assertion custom_call ops.
+  A tuple containing:
+  * The model's state (weights) as a pytree of JAX arrays.
+  * A `jax.export.Exported` object containing the StableHLO representation of
+    the model.
   """
   weights, func = exported_program_to_jax(exported_program)
   jax_avals = extract_avals(exported_program)
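
A companion sketch for `exported_program_to_stablehlo`, again using a toy model. The attributes available on the returned `jax.export.Exported` object depend on your JAX version and are noted as assumptions in the comments.

```python
# Hedged sketch: export a toy model and lower it to StableHLO via torchax.
import torch
from torchax import export as tx_export


class SinPlusOne(torch.nn.Module):  # toy model, for illustration only

  def forward(self, x):
    return torch.sin(x) + 1.0


exported = torch.export.export(SinPlusOne(), (torch.randn(4),))
weights, stablehlo_exported = tx_export.exported_program_to_stablehlo(exported)
# `weights` is a pytree of JAX arrays; `stablehlo_exported` is a
# jax.export.Exported object. Depending on the JAX version, the StableHLO text
# can be inspected via something like stablehlo_exported.mlir_module().
```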

torchax/torchax/flax.py

Lines changed: 25 additions & 0 deletions
@@ -6,8 +6,32 @@
 
 
 class FlaxNNModule(torch.nn.Module):
+  """A `torch.nn.Module` that wraps a Flax module for interoperability.
+
+  This class allows you to use a Flax module within a PyTorch model. It
+  initializes the Flax module, extracts its parameters, and wraps them in a
+  `torch.nn.ParameterDict` so they can be managed by PyTorch. The `forward`
+  pass then calls the Flax module's `apply` method with the appropriate
+  parameters.
+
+  **Attributes:**
+
+  * `_params` (`torch.nn.Module`): A nested `torch.nn.Module` that holds the
+    parameters of the Flax module.
+  * `_flax_module`: The original Flax module.
+  """
 
   def __init__(self, env, flax_module, sample_args, sample_kwargs=None):
+    """Initializes the `FlaxNNModule`.
+
+    **Args:**
+
+    * `env`: The `torchax` environment.
+    * `flax_module`: The Flax module to wrap.
+    * `sample_args`: A tuple of sample arguments to initialize the Flax module.
+    * `sample_kwargs` (optional): A dictionary of sample keyword arguments to
+      initialize the Flax module.
+    """
     super().__init__()
     prng = env.prng_key
     sample_kwargs = sample_kwargs or {}
@@ -34,6 +58,7 @@ def _decode_nested_dict(self, child_module):
     return result
 
   def forward(self, *args, **kwargs):
+    """Performs the forward pass by calling the wrapped Flax module."""
     nested_dict_params = self._decode_nested_dict(self._params)
     return tx.interop.call_jax(self._flax_module.apply, nested_dict_params,
                                *args, **kwargs)
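
A hedged usage sketch of `FlaxNNModule`, assuming `flax` is installed, that the environment is obtained via `torchax.default_env()`, and that tensors are moved to the JAX backend with `.to('jax')`; argument order follows the `__init__` docstring above.

```python
# Hedged sketch: wrapping a Flax Dense layer so it can be used from PyTorch.
import flax.linen as nn
import torch
import torchax
from torchax.flax import FlaxNNModule

env = torchax.default_env()
flax_layer = nn.Dense(features=16)

with env:
  sample = torch.randn(2, 8).to('jax')          # sample input used to init Flax params
  wrapped = FlaxNNModule(env, flax_layer, (sample,))
  out = wrapped(sample)                         # forward() calls flax_layer.apply
  print(out.shape)                              # expected: (2, 16)
```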
