From 4a622991aa85b2c376c9bf89058d36950811f748 Mon Sep 17 00:00:00 2001 From: Chun-nien Chan Date: Thu, 7 Aug 2025 16:11:51 -0700 Subject: [PATCH] Add aten.scatter.src gpu friendly decomp PiperOrigin-RevId: 792347731 --- .../fx_passes/build_aten_composite_pass.py | 2 + .../generative/examples/gemma/gemma1.py | 51 ++++++++-- .../examples/smolvlm2/verify_encoder.py | 95 +++++++++++++++++++ .../examples/smolvlm2/vision_encoder.py | 13 ++- .../experimental/torch_tfl/_decomps.py | 61 ++++++++++++ .../experimental/torch_tfl/_lowerings.py | 29 ++++++ .../odml_torch/experimental/torch_tfl/_ops.py | 12 +++ .../torch_tfl/test/test_torch_tfl_impls.py | 22 +++++ ai_edge_torch/odml_torch/export.py | 4 +- .../odml_torch/lowerings/_decomp_registry.py | 69 ++++++++++++++ .../odml_torch/lowerings/_jax_lowerings.py | 1 - .../odml_torch/optimization_barrier.py | 71 ++++++++++++++ .../odml_torch/test/test_core_aten_ops.py | 2 +- .../test/test_optimization_barrier.py | 80 ++++++++++++++++ 14 files changed, 499 insertions(+), 13 deletions(-) create mode 100644 ai_edge_torch/generative/examples/smolvlm2/verify_encoder.py create mode 100644 ai_edge_torch/odml_torch/optimization_barrier.py create mode 100644 ai_edge_torch/odml_torch/test/test_optimization_barrier.py diff --git a/ai_edge_torch/_convert/fx_passes/build_aten_composite_pass.py b/ai_edge_torch/_convert/fx_passes/build_aten_composite_pass.py index 3e92d9cd..fcc45513 100644 --- a/ai_edge_torch/_convert/fx_passes/build_aten_composite_pass.py +++ b/ai_edge_torch/_convert/fx_passes/build_aten_composite_pass.py @@ -16,6 +16,7 @@ from typing import Any, Callable from ai_edge_torch import fx_infra from ai_edge_torch import lowertools +from ai_edge_torch.odml_torch import optimization_barrier as optimization_barrier_lib import torch import torch.utils._pytree as pytree @@ -276,6 +277,7 @@ def embedding(*args, **kwargs): # Explicitly reshape back to the original shape. This places the ReshapeOp # outside of the HLFB. 
output = torch.reshape(output, (*(original_idx_shape), embedding_dim)) + output, _ = optimization_barrier_lib.optimization_barrier(output, idx) return output node.target = embedding diff --git a/ai_edge_torch/generative/examples/gemma/gemma1.py b/ai_edge_torch/generative/examples/gemma/gemma1.py index 0a04ed3f..c2549ff7 100644 --- a/ai_edge_torch/generative/examples/gemma/gemma1.py +++ b/ai_edge_torch/generative/examples/gemma/gemma1.py @@ -23,7 +23,7 @@ import torch from torch import nn -TENSOR_NAMES = loading_utils.ModelLoader.TensorNames( +TENSOR_NAMES_FUSED_QKV = loading_utils.ModelLoader.TensorNames( ff_up_proj="model.layers.{}.mlp.up_proj", ff_down_proj="model.layers.{}.mlp.down_proj", ff_gate_proj="model.layers.{}.mlp.gate_proj", @@ -36,6 +36,24 @@ lm_head=None, ) +TENSOR_NAMES_SEP_QKV = loading_utils.ModelLoader.TensorNames( + ff_up_proj="model.layers.{}.mlp.up_proj", + ff_down_proj="model.layers.{}.mlp.down_proj", + ff_gate_proj="model.layers.{}.mlp.gate_proj", + attn_query_proj="model.layers.{}.self_attn.q_proj", + attn_key_proj="model.layers.{}.self_attn.k_proj", + attn_value_proj="model.layers.{}.self_attn.v_proj", + attn_output_proj="model.layers.{}.self_attn.o_proj", + pre_attn_norm="model.layers.{}.input_layernorm", + post_attn_norm="model.layers.{}.post_attention_layernorm", + embedding="model.embed_tokens", + final_norm="model.norm", +) + +TENSOR_NAMES_DICT = { + "safetensors": TENSOR_NAMES_SEP_QKV, + "kaggle": TENSOR_NAMES_FUSED_QKV, +} class Gemma1(model_builder.DecoderOnlyModel): """A Gemma1 model built from the Edge Generative API layers.""" @@ -94,11 +112,28 @@ def build_2b_model( custom_loader: Callable[[str], Dict[str, torch.Tensor]] = None, mask_cache_size: int = 0, ) -> nn.Module: - return model_builder.build_decoder_only_model( - checkpoint_path=checkpoint_path, - config=get_model_config_2b(), - tensor_names=TENSOR_NAMES, - model_class=Gemma1, - custom_loader=custom_loader, - mask_cache_size=mask_cache_size, + + # A list to store the 
reasons for each failure + key_errors = [] + + for tensor_names in TENSOR_NAMES_DICT.values(): + try: + return model_builder.build_decoder_only_model( + checkpoint_path=checkpoint_path, + config=get_model_config_2b(), + tensor_names=tensor_names, + model_class=Gemma1, + custom_loader=custom_loader, + mask_cache_size=mask_cache_size, + ) + except KeyError as ke: + # Store the specific key that was missing for later + key_errors.append(f"Missing key: {ke}") + continue + + # If the loop finishes, raise an error with all the collected details + error_details = "\n".join(key_errors) + raise RuntimeError( + "Failed to build model after trying all configurations. " + f"Encountered the following errors:\n{error_details}" ) diff --git a/ai_edge_torch/generative/examples/smolvlm2/verify_encoder.py b/ai_edge_torch/generative/examples/smolvlm2/verify_encoder.py new file mode 100644 index 00000000..7811821b --- /dev/null +++ b/ai_edge_torch/generative/examples/smolvlm2/verify_encoder.py @@ -0,0 +1,95 @@ +# Copyright 2024 The AI Edge Torch Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Verifies the reauthored SmolVLM2 Image Encoder model.""" + +import logging + +from absl import app +from absl import flags +from ai_edge_torch.generative.examples.smolvlm2 import smolvlm2 +from ai_edge_torch.generative.examples.smolvlm2 import vision_encoder +from PIL import Image +import requests +import torch +import transformers + +_IMAGE_URL = flags.DEFINE_string( + "image_url", + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true", + "The image URI to encode.", +) + +_CHECKPOINT = flags.DEFINE_string( + "checkpoint", + "HuggingFaceTB/SmolVLM2-2.2B-Instruct", + "The checkpoint to verify.", +) + +_REAUTHORTHED_CHECKPOINT = flags.DEFINE_string( + "pretrained_weights_path", + None, + "The path to the model's pretrained weights.", +) + + +def main(_): + checkpoint = _CHECKPOINT.value + logging.info("Loading the original model from: %s", checkpoint) + original_model = transformers.AutoModelForImageTextToText.from_pretrained( + checkpoint + ) + original_model = original_model.eval().model + + logging.info("Building the reauthored checkpoint from: %s", checkpoint) + reauthored_checkpoint = _REAUTHORTHED_CHECKPOINT.value + if reauthored_checkpoint is None: + raise ValueError("reauthored_checkpoint is required.") + + logging.info("Building the reauthored model from: %s", reauthored_checkpoint) + reauthored_model = vision_encoder.build_image_encoder(reauthored_checkpoint) + + logging.info("Loading the tokenizer from: %s", checkpoint) + processor = transformers.AutoProcessor.from_pretrained(checkpoint) + + logging.info("Loading the image from: %s", _IMAGE_URL.value) + image = Image.open(requests.get(_IMAGE_URL.value, stream=True).raw) + pixel_values = processor(images=image, return_tensors="pt")["pixel_values"] + + logging.info("Forwarding the original model...") + outputs_original = 
original_model.get_image_features(pixel_values) + logging.info("outputs_original's shape: %s", outputs_original.shape) + + pixel_values = pixel_values.reshape( + pixel_values.shape[0] * pixel_values.shape[1], *pixel_values.shape[2:] + ) + logging.info("Forwarding the reauthored model...") + outputs_reauthored = reauthored_model.forward( + pixel_values=pixel_values + ) + logging.info("outputs_reauthored's shape: %s", outputs_reauthored.shape) + + try: + assert torch.allclose( + outputs_original, outputs_reauthored, atol=1e-03, rtol=1e-04 + ) + except AssertionError as e: + logging.error("*** FAILED *** verify with an image") + raise e + else: + logging.info("*** PASSED *** verify with an image") + + +if __name__ == "__main__": + app.run(main) diff --git a/ai_edge_torch/generative/examples/smolvlm2/vision_encoder.py b/ai_edge_torch/generative/examples/smolvlm2/vision_encoder.py index 676c5c32..3b5d15da 100644 --- a/ai_edge_torch/generative/examples/smolvlm2/vision_encoder.py +++ b/ai_edge_torch/generative/examples/smolvlm2/vision_encoder.py @@ -129,7 +129,18 @@ def forward( pixel_values: torch.Tensor, export_config: export_cfg.ExportConfig = None, ) -> torch.Tensor: - x = self.siglip_encoder(pixel_values) + # Embed the image according to SiglipVisionEmbeddings. + x = self.siglip_encoder.tok_embedding(pixel_values) + x = x.flatten(2).transpose(1, 2) + x = x + self.siglip_encoder.tok_embedding_position + + # Pass a dummy mask because SDPA attention impl expects non-None mask. + mask = torch.zeros(x.shape[0], 1, x.shape[1], x.shape[1]) + for _, block in enumerate(self.siglip_encoder.transformer_blocks): + x = block(x, mask=mask) + x = self.siglip_encoder.final_norm(x) + + # Project the image embeddings to text hidden size. 
x = self.connector(x) return x diff --git a/ai_edge_torch/odml_torch/experimental/torch_tfl/_decomps.py b/ai_edge_torch/odml_torch/experimental/torch_tfl/_decomps.py index 250118ef..3e249c77 100644 --- a/ai_edge_torch/odml_torch/experimental/torch_tfl/_decomps.py +++ b/ai_edge_torch/odml_torch/experimental/torch_tfl/_decomps.py @@ -180,6 +180,11 @@ def _aten_rsqrt_decomp(x): return torch.ops.tfl.rsqrt(x) +@register_decomp(torch.ops.aten.neg.default) +def _aten_neg_decomp(x): + return torch.ops.tfl.neg(x) + + @register_decomp(torch.ops.aten.gelu.default) def _aten_gelu_decomp(x, approximate="none"): return torch.ops.tfl.gelu(x, approximate != "none") @@ -317,6 +322,38 @@ def _aten_select_int_decomp(x, dim, index): return torch.ops.tfl.squeeze(sliced, [dim]) +@register_decomp(torch.ops.aten.slice.Tensor) +def _aten_slice_tensor_decomp(x, dim=0, start=None, end=None, step=1): + rank = x.dim() + dim_size = x.shape[dim] + + # Initialize begin, end, strides for tfl.strided_slice + begin = [0] * rank + end_vec = list(x.shape) + strides = [1] * rank + + # The logic below is to match PyTorch's `slice` behavior. + # `start` and `end` can be negative, which means they count from the end. + # `start=None` defaults to 0. + # `end=None` or a large number defaults to `dim_size` after clamping. + + start_val = 0 if start is None else start + if start_val < 0: + start_val += dim_size + + end_val = dim_size if end is None else end + if end_val < 0: + end_val += dim_size + + # Clamp start and end to be within the dimension size, following PyTorch's + # logic. 
+ start_val = max(0, min(start_val, dim_size)) + end_val = max(start_val, min(end_val, dim_size)) + + begin[dim], end_vec[dim], strides[dim] = start_val, end_val, step + return torch.ops.tfl.strided_slice(x, begin, end_vec, strides) + + @register_decomp(torch.ops.aten.where.self) def _aten_where_self_decomp(condition, x, y): x, y = _promote_types_for_binary_op(x, y) @@ -351,3 +388,27 @@ def _aten__softmax_decomp( softmax_result = torch.ops.tfl.softmax(x_permuted) # Transpose the result back to the original dimensions. return torch.ops.tfl.transpose(softmax_result, dims) + + +@register_decomp(torch.ops.aten.topk.default) +def _aten_topk_decomp(self, k, dim=-1, largest=True, sorted=True): + if not largest: + raise ValueError("Only largest=True is supported for torch.topk.") + + if dim < 0: + dim = self.dim() + dim + + if dim != self.dim() - 1: + self = torch.transpose(self, dim, -1) + + # Ignores sorted value: tfl.topk_v2 only supports sorted=True, but it doesn't + # affect the correctness of the output. + out, indices = torch.ops.tfl.topk_v2(self, k) + + if dim != self.dim() - 1: + out = torch.transpose(out, dim, -1) + indices = torch.transpose(indices, dim, -1) + + # torch.topk returns int64 indices, but tfl.topk_v2 returns indices in int32. 
+ indices = indices.to(torch.int64) + return out, indices diff --git a/ai_edge_torch/odml_torch/experimental/torch_tfl/_lowerings.py b/ai_edge_torch/odml_torch/experimental/torch_tfl/_lowerings.py index 5d3425a7..a2861b8b 100644 --- a/ai_edge_torch/odml_torch/experimental/torch_tfl/_lowerings.py +++ b/ai_edge_torch/odml_torch/experimental/torch_tfl/_lowerings.py @@ -286,6 +286,18 @@ def _tfl_rsqrt_lowering( ) +@lower(torch.ops.tfl.neg.default) +def _tfl_neg_lowering( + lctx: LoweringContext, + x: ir.Value, +) -> ir.Value: + return _ir_operation( + "tfl.neg", + results=lowering_utils.node_meta_to_ir_types(lctx.node), + operands=[x], + ) + + @lower(torch.ops.tfl.gelu.default) def _tfl_gelu_lowering( lctx: LoweringContext, @@ -674,3 +686,20 @@ def _tfl_softmax_lowering( "beta": ir.FloatAttr.get(ir.F32Type.get(), beta), }, ) + + +@lower(torch.ops.tfl.topk_v2.default) +def _tfl_topk_v2_lowering( + lctx: LoweringContext, + x: ir.Value, + k: int, +) -> tuple[ir.Value, ir.Value]: + return _ir_operation( + "tfl.topk_v2", + results=lowering_utils.node_meta_to_ir_types(lctx.node), + operands=[ + x, + lowering_utils.numpy_array_constant(np.array(k, dtype=np.int32)), + ], + attributes={}, + ) diff --git a/ai_edge_torch/odml_torch/experimental/torch_tfl/_ops.py b/ai_edge_torch/odml_torch/experimental/torch_tfl/_ops.py index 9364a487..95e1e534 100644 --- a/ai_edge_torch/odml_torch/experimental/torch_tfl/_ops.py +++ b/ai_edge_torch/odml_torch/experimental/torch_tfl/_ops.py @@ -110,6 +110,11 @@ def tfl_rsqrt(x: torch.Tensor) -> torch.Tensor: return torch.rsqrt(x) +@custom_op_with_fake("tfl::neg") +def tfl_neg(x: torch.Tensor) -> torch.Tensor: + return torch.neg(x) + + @custom_op_with_fake("tfl::gelu") def tfl_gelu(x: torch.Tensor, approximate: bool = False) -> torch.Tensor: gelu_approximate = "tanh" if approximate else "none" @@ -292,6 +297,13 @@ def tfl_softmax(x: torch.Tensor) -> torch.Tensor: return torch.nn.functional.softmax(x, dim=-1) +@custom_op_with_fake("tfl::topk_v2") 
+def tfl_topk_v2(x: torch.Tensor, k: int) -> tuple[torch.Tensor, torch.Tensor]: + out, indices = torch.topk(x, k, dim=-1, largest=True, sorted=True) + indices = indices.to(torch.int32) + return out, indices + + @custom_op_with_fake( "tfl::slice", schema="(Tensor x, SymInt[] begin, SymInt[] size) -> Tensor" ) diff --git a/ai_edge_torch/odml_torch/experimental/torch_tfl/test/test_torch_tfl_impls.py b/ai_edge_torch/odml_torch/experimental/torch_tfl/test/test_torch_tfl_impls.py index 6f2250aa..28103bb7 100644 --- a/ai_edge_torch/odml_torch/experimental/torch_tfl/test/test_torch_tfl_impls.py +++ b/ai_edge_torch/odml_torch/experimental/torch_tfl/test/test_torch_tfl_impls.py @@ -30,6 +30,14 @@ export_with_tensor_inputs_only = testing.export_with_tensor_inputs_only +def tree_map_list_to_tuple(x): + if isinstance(x, (list, tuple)): + return tuple(tree_map_list_to_tuple(y) for y in x) + if isinstance(x, dict): + return {k: tree_map_list_to_tuple(v) for k, v in x.items()} + return x + + def rnd(dtype, shape, min_v=None, max_v=None): """Shortcut for creating a random torch tensor.""" if dtype in (torch.int32, torch.int64, torch.bool): @@ -98,6 +106,9 @@ def _assert_export_and_close( actual = edge_model(*args, **kwargs) with self.subTest("torch_convert_eval_diff:" + str(atol)): + expected = tree_map_list_to_tuple(expected) + actual = tree_map_list_to_tuple(actual) + expected_flat, expected_spec = pytree.tree_flatten(expected) actual_flat, actual_spec = pytree.tree_flatten(actual) @@ -152,6 +163,7 @@ def _assert_export_and_close( ("aten_cos_1", torch.ops.aten.cos.default, (rnd(torch.float32, (1, 10)),), dict()), ("aten_rsqrt_0", torch.ops.aten.rsqrt.default, (rnd(torch.float32, (10, 10)),), dict()), ("aten_rsqrt_1", torch.ops.aten.rsqrt.default, (rnd(torch.float32, (1, 10)),), dict()), + ("aten_neg_0", torch.ops.aten.neg.default, (rnd(torch.float32, (10, 10)),), dict()), ("aten_gelu_0", torch.ops.aten.gelu.default, (rnd(torch.float32, (10, 10)),), dict()), ("aten_gelu_1", 
torch.ops.aten.gelu.default, (rnd(torch.float32, (10, 10)),), dict(approximate="tanh")), ("aten_gelu_2", torch.ops.aten.gelu.default, (rnd(torch.float32, (1, 10)),), dict()), @@ -186,6 +198,14 @@ def _assert_export_and_close( ("aten_squeeze_dims_0", torch.ops.aten.squeeze.dims, (rnd(torch.float32, (2, 1, 2, 1, 2)), [1, 2, 3],), dict()), ("aten_select_int_0", torch.ops.aten.select.int, (rnd(torch.float32, (2, 3, 4)), 0, 1,), dict()), ("aten_select_int_1", torch.ops.aten.select.int, (rnd(torch.float32, (2, 3, 4)), 1, 1,), dict()), + ("aten_slice_tensor_0", torch.ops.aten.slice.Tensor, (rnd(torch.float32, (10, 10)),), dict(dim=0, start=1, end=3)), + ("aten_slice_tensor_1", torch.ops.aten.slice.Tensor, (rnd(torch.float32, (10, 10)),), dict(dim=1, start=2, end=5)), + ("aten_slice_tensor_2", torch.ops.aten.slice.Tensor, (rnd(torch.float32, (10, 10)),), dict(dim=0, start=None, end=5)), + ("aten_slice_tensor_3", torch.ops.aten.slice.Tensor, (rnd(torch.float32, (10, 10)),), dict(dim=0, start=2, end=None)), + ("aten_slice_tensor_4", torch.ops.aten.slice.Tensor, (rnd(torch.float32, (10, 10)),), dict(dim=0, start=-5, end=-2)), + ("aten_slice_tensor_5", torch.ops.aten.slice.Tensor, (rnd(torch.float32, (10, 10)),), dict(dim=0, start=1, end=8, step=2)), + ("aten_slice_tensor_6", torch.ops.aten.slice.Tensor, (rnd(torch.float32, (10, 10)),), dict(dim=1, start=2, end=100)), + ("aten_slice_tensor_7", torch.ops.aten.slice.Tensor, (rnd(torch.float32, (10, 10)),), dict(dim=0, start=None, end=None)), ("aten_where_self_0", torch.ops.aten.where.self, (rnd(torch.bool, (10, 10)), rnd(torch.float32, (10, 10)), rnd(torch.float32, (10, 10)),), dict()), ("aten_embedding_0", torch.ops.aten.embedding.default, (rnd(torch.float32, (10, 10)), torch.tensor([[0, 2, 4, 6, 8], [1, 3, 5, 7, 9]]),), dict()), ("aten__softmax_0", torch.ops.aten._softmax.default, (rnd(torch.float32, (10, 10)), -1, False), dict()), @@ -194,6 +214,8 @@ def _assert_export_and_close( ("aten__softmax_3", 
torch.ops.aten._softmax.default, (rnd(torch.float32, (1, 10)), 0, False), dict()), ("aten__softmax_4", torch.ops.aten._softmax.default, (rnd(torch.float32, (10, 10)), 1, False), dict()), ("aten__softmax_5", torch.ops.aten._softmax.default, (rnd(torch.float32, (1, 10)), 1, False), dict()), + ("aten_topk_0", torch.ops.aten.topk.default, (rnd(torch.float32, (4, 10)), 3), dict()), + ("aten_topk_1", torch.ops.aten.topk.default, (rnd(torch.float32, (4, 10)), 3), dict(dim=0)), # fmt: on # pyformat: enable ) diff --git a/ai_edge_torch/odml_torch/export.py b/ai_edge_torch/odml_torch/export.py index 2783af11..8e8c17e4 100644 --- a/ai_edge_torch/odml_torch/export.py +++ b/ai_edge_torch/odml_torch/export.py @@ -21,7 +21,7 @@ from typing import Any, Callable, Optional from ai_edge_torch import fx_infra -from jax.lib import xla_extension +import jax.extend from jax._src.lib.mlir import ir from jax._src.lib.mlir.dialects import func from jax._src.lib.mlir.dialects import hlo as stablehlo @@ -233,7 +233,7 @@ def module_bytecode_vhlo(self) -> bytes: target_version = stablehlo.get_version_from_compatibility_requirement( stablehlo.StablehloCompatibilityRequirement.WEEK_12 ) - module_bytecode = xla_extension.mlir.serialize_portable_artifact( + module_bytecode = jax.extend.mlir.serialize_portable_artifact( self.module_bytecode, target_version ) return module_bytecode diff --git a/ai_edge_torch/odml_torch/lowerings/_decomp_registry.py b/ai_edge_torch/odml_torch/lowerings/_decomp_registry.py index bf390d69..d480c6e6 100644 --- a/ai_edge_torch/odml_torch/lowerings/_decomp_registry.py +++ b/ai_edge_torch/odml_torch/lowerings/_decomp_registry.py @@ -14,6 +14,7 @@ # ============================================================================== """Torch export decompositions to run before lowering.""" +import functools from ai_edge_torch import fx_infra import torch @@ -65,3 +66,71 @@ torch.ops.aten._safe_softmax.default, torch.softmax, ) + + +# Decomp torch.scatter into one_hot, 
broadcasting, mul, and selects. +# This is a more GPU-friendly implementation than default +# lowering via stablehlo.scatter or tfl.scatter_nd. +@functools.partial( + fx_infra.decomp.add_pre_convert_decomp, torch.ops.aten.scatter.src +) +def _scatter_impl( + self: torch.Tensor, dim: int, index: torch.Tensor, src: torch.Tensor +) -> torch.Tensor: + if dim < 0: + dim = self.dim() + dim + + # --- 1. Slice `src` to match the shape of `index` --- + slicing_idx_for_src = tuple(slice(s) for s in index.shape) + src_sliced = src[slicing_idx_for_src] + + # --- 2. Compute updates for the relevant slice using one_hot --- + num_classes = self.shape[dim] + one_hot_indices = torch.nn.functional.one_hot(index, num_classes) + slice_updates_unaggregated = src_sliced.unsqueeze(-1) * one_hot_indices + slice_updates_summed = slice_updates_unaggregated.sum(dim=dim) + slice_condition_summed = one_hot_indices.any(dim=dim) + + # --- 3. Permute the computed slice to the correct dimension order --- + n_dims = self.dim() + slice_updates = slice_updates_summed + slice_condition = slice_condition_summed + if n_dims > 1: + permute_order = list(range(n_dims - 1)) + permute_order.insert(dim, n_dims - 1) + slice_updates = slice_updates_summed.permute(permute_order) + slice_condition = slice_condition_summed.permute(permute_order) + + # --- 4. Pad the smaller tensors to match the shape of `self` --- + require_padding = True + try: + shape = torch.broadcast_shapes(slice_updates.shape, self.shape) + if shape == self.shape: + require_padding = False + except RuntimeError: + # Shapes are not broadcastable. 
+ require_padding = True + pass + + if require_padding: + pad_amounts = [] + for i in range(n_dims - 1, -1, -1): + padding_needed = self.shape[i] - slice_updates.shape[i] + # Add 0 for the "start" and the needed amount for the "end" + pad_amounts.extend([0, padding_needed]) + updates_tensor = torch.nn.functional.pad( + slice_updates, pad_amounts, "constant", 0 + ) + condition_mask = torch.nn.functional.pad( + slice_condition, pad_amounts, "constant", 0 + ) + else: + updates_tensor = slice_updates + condition_mask = slice_condition + + # --- 5. Use `torch.where` on correctly-sized tensors --- + # IMPORTANT NOTE: When indices are not unique, the behavior of torch scatter + # is non-deterministic (one of the values from src will be picked + # arbitrarily) + result = torch.where(condition_mask, updates_tensor, self) + return result diff --git a/ai_edge_torch/odml_torch/lowerings/_jax_lowerings.py b/ai_edge_torch/odml_torch/lowerings/_jax_lowerings.py index 41cdf033..09e1b5b0 100644 --- a/ai_edge_torch/odml_torch/lowerings/_jax_lowerings.py +++ b/ai_edge_torch/odml_torch/lowerings/_jax_lowerings.py @@ -193,7 +193,6 @@ def lower_by_torch_xla2(op): lower_by_torch_xla2(torch.ops.aten.round) lower_by_torch_xla2(torch.ops.aten.rsqrt) lower_by_torch_xla2(torch.ops.aten.scalar_tensor) -lower_by_torch_xla2(torch.ops.aten.scatter.src) lower_by_torch_xla2(torch.ops.aten.scatter.value) lower_by_torch_xla2(torch.ops.aten.scatter_add) lower_by_torch_xla2(torch.ops.aten.scatter_reduce) diff --git a/ai_edge_torch/odml_torch/optimization_barrier.py b/ai_edge_torch/odml_torch/optimization_barrier.py new file mode 100644 index 00000000..88778b37 --- /dev/null +++ b/ai_edge_torch/odml_torch/optimization_barrier.py @@ -0,0 +1,71 @@ +# Copyright 2025 The AI Edge Torch Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Optimization barrier op definition and lowering.""" + +from ai_edge_torch.odml_torch import _torch_library +from ai_edge_torch.odml_torch.lowerings import registry +from jax._src.lib.mlir import ir +from jax._src.lib.mlir.dialects import hlo as stablehlo +import torch +import torch.utils._pytree as pytree + +_torch_library.ODML_TORCH_LIB.define( + "optimization_barrier(Tensor[] inputs) -> Tensor[]" +) + +optimization_barrier_op = torch.ops.odml_torch.optimization_barrier.default + + +def optimization_barrier(*inputs: pytree.PyTree): + """Apply optimization barrier to the tensors nested within arbitrary pytrees. + + Args: + *inputs: A list of tensors or tensor pytrees. + + Returns: + The tensors after optimization barrier in the same pytrees structures. 
+ """ + if len(inputs) == 1: + inputs = inputs[0] + tensors, spec = pytree.tree_flatten(inputs) + tensors = optimization_barrier_op(tuple(tensors)) + outputs = pytree.tree_unflatten(tensors, spec) + return outputs + + +@torch.library.impl( + _torch_library.ODML_TORCH_LIB, + "optimization_barrier", + "CompositeExplicitAutograd", +) +def _optimization_barrier_impl(inputs: tuple[torch.Tensor, ...]): + return tuple(inputs) + + +@torch.library.impl( + _torch_library.ODML_TORCH_LIB, + "optimization_barrier", + "Meta", +) +def _optimization_barrier_fake(inputs: tuple[torch.Tensor, ...]): + return tuple([torch.empty_like(x) for x in inputs]) + + +@registry.lower(torch.ops.odml_torch.optimization_barrier.default) +def _optimization_barrier_lowering( + lctx, inputs: tuple[ir.Value, ...] +) -> ir.Value: + del lctx + return stablehlo.optimization_barrier(inputs) diff --git a/ai_edge_torch/odml_torch/test/test_core_aten_ops.py b/ai_edge_torch/odml_torch/test/test_core_aten_ops.py index 40bdc0f8..ca58e55d 100644 --- a/ai_edge_torch/odml_torch/test/test_core_aten_ops.py +++ b/ai_edge_torch/odml_torch/test/test_core_aten_ops.py @@ -342,7 +342,7 @@ def _run_export_and_compare( ("aten_rsub_Scalar_0", torch.ops.aten.rsub.Scalar, (rnd(torch.float32, (10, 10)), 0.123,), dict()), ("aten_scatter_add_0", torch.ops.aten.scatter_add, (rnd(torch.float32, (10, 10)), 1, rnd(torch.int64, (2, 2)), rnd(torch.float32, (10, 10)),), dict()), ("aten_scatter_reduce_two_0", torch.ops.aten.scatter_reduce.two, (rnd(torch.float32, (10, 10)), 1, rnd(torch.int64, (10, 10)), rnd(torch.float32, (10, 10)), "sum",), dict()), - ("aten_scatter_src_0", torch.ops.aten.scatter.src, (rnd(torch.float32, (10, 10)), 1, rnd(torch.int64, (10, 10)), rnd(torch.float32, (10, 10)),), dict()), + ("aten_scatter_src_0", torch.ops.aten.scatter.src, (rnd(torch.float32, (3, 5)), 0, torch.tensor([[0, 1, 2, 0]]), rnd(torch.float32, (2, 5)),), dict()), ("aten_scatter_value_0", torch.ops.aten.scatter.value, (rnd(torch.float32, (10, 
10)), 1, rnd(torch.int64, (10, 10)), 1,), dict()), ("aten_select_copy_int_0", torch.ops.aten.select_copy.int, (rnd(torch.float32, (10, 10)), 1, 0,), dict()), ("aten_select_int_0", torch.ops.aten.select.int, (rnd(torch.float32, (10, 10)), 1, 1,), dict()), diff --git a/ai_edge_torch/odml_torch/test/test_optimization_barrier.py b/ai_edge_torch/odml_torch/test/test_optimization_barrier.py new file mode 100644 index 00000000..d25d8c8e --- /dev/null +++ b/ai_edge_torch/odml_torch/test/test_optimization_barrier.py @@ -0,0 +1,80 @@ +# Copyright 2025 The AI Edge Torch Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +from ai_edge_torch import odml_torch +from ai_edge_torch.odml_torch import optimization_barrier as optimization_barrier_lib # Import to register the op. 
+import torch + +from absl.testing import absltest as googletest + +optimization_barrier = optimization_barrier_lib.optimization_barrier + + +class TestOptimizationBarrier(googletest.TestCase): + """Test optimization barrier op implementation and lowering.""" + + def test_applied_optimization_barrier_op(self): + """Test optimization barrier op application and lowering.""" + + class TestModel(torch.nn.Module): + + def forward(self, x, y): + x, _ = optimization_barrier(x, y) + return x + + x = torch.randn(1, 5) + ep = torch.export.export(TestModel().eval(), (x, x)) + mlir = odml_torch.export.exported_program_to_mlir(ep) + mlir_text = mlir.get_text() + self.assertEqual( + mlir_text.count( + "stablehlo.optimization_barrier %arg1, %arg1 : tensor<1x5xf32>," + " tensor<1x5xf32>" + ), + 1, + ) + + def test_input_single_tensor(self): + """Test optimization barrier with single tensor input.""" + x = torch.randn(1, 5) + y = optimization_barrier(x) + self.assertIsInstance(y, torch.Tensor) + self.assertEqual(y.shape, (1, 5)) + + def test_input_multiple_tensors(self): + """Test optimization barrier with multiple tensors input.""" + x = torch.randn(1, 5) + y = torch.randn(1, 6) + z = optimization_barrier(x, y) + self.assertIsInstance(z, tuple) + self.assertLen(z, 2) + self.assertIsInstance(z[0], torch.Tensor) + self.assertIsInstance(z[1], torch.Tensor) + self.assertEqual(z[0].shape, (1, 5)) + self.assertEqual(z[1].shape, (1, 6)) + + def test_input_nested_tensors(self): + """Test optimization barrier with nested tensor inputs.""" + x = {"foo": torch.randn(1, 5), "bar": torch.randn(1, 6)} + z = optimization_barrier(x) + self.assertIsInstance(z, dict) + self.assertLen(z, 2) + self.assertIsInstance(z["foo"], torch.Tensor) + self.assertIsInstance(z["bar"], torch.Tensor) + self.assertEqual(z["foo"].shape, (1, 5)) + self.assertEqual(z["bar"].shape, (1, 6)) + + +if __name__ == "__main__": + googletest.main()