openxla
diff --git a/‎xla/backends/gpu/codegen/triton/ir/triton_xla_ops.td‎
Lines changed: 10 additions & 6 deletions b/‎xla/backends/gpu/codegen/triton/ir/triton_xla_ops.td‎
Lines changed: 10 additions & 6 deletions
diff --git a/‎xla/backends/gpu/codegen/triton/transforms/tests/triton_xla_remote_access.mlir‎
Lines changed: 59 additions & 12 deletions b/‎xla/backends/gpu/codegen/triton/transforms/tests/triton_xla_remote_access.mlir‎
Lines changed: 59 additions & 12 deletions
diff --git a/‎xla/backends/gpu/codegen/triton/transforms/triton_xla_lower_remote_access_pass.cc‎
Lines changed: 26 additions & 10 deletions b/‎xla/backends/gpu/codegen/triton/transforms/triton_xla_lower_remote_access_pass.cc‎
Lines changed: 26 additions & 10 deletions
diff --git a/‎xla/backends/gpu/runtime/all_reduce_test.cc‎
Lines changed: 23 additions & 5 deletions b/‎xla/backends/gpu/runtime/all_reduce_test.cc‎
Lines changed: 23 additions & 5 deletions
@@ -401,22 +401,26 @@ def TTXLA_GetRankOp : TTXLA_Op<"get_rank", [Pure]> {
 def TTXLA_GetPeerPtrOp : TTXLA_Op<"get_peer_ptr", [Pure]> {
   let summary = [{
     Extract the pointer to the given symmetric memory `address` on the given
-    `peer` device using the symmetric memory `metadata`.
-    For this an operation first calculates an offset of the `address` to the
-    current rank symmetric memory range, and the adds this offset to the 
-    symmetric memory range of the `peer` device.
+    `peer` device. An `address` should point to the memory of the given kernel
+    argument with `argument_index`. The result is calculated using the symmetric
+    memory `metadata` constructed at runtime.
+    To calculate offsets, the operation also needs to know the number of devices
+    participating in the collective operation (`world_size`).
   }];
   let arguments = (ins
     Arg<TT_PtrLike, "",
       [MemRead<GlobalMemory>]>:$address,
     I64:$peer_id,
     Arg<TT_PtrLike, "",
-      [MemRead<GlobalMemory>]>:$metadata);
+      [MemRead<GlobalMemory>]>:$metadata,
+    I32Attr:$argument_index,
+    // The number of devices participating in the collective operation.
+    I32Attr:$world_size);
 
   let results = (outs Arg<TT_PtrLike, "", [MemRead<GlobalMemory>]>:$result);
 
   let assemblyFormat = [{
-    $address `,` $peer_id `,` $metadata attr-dict `:`
+    $address `,` $peer_id `,` $metadata `,` attr-dict `:`
     functional-type(operands, results)
   }];
 }
 
@@ -12,39 +12,86 @@ tt.func @get_rank(
 }
 
 tt.func @get_peer_ptr(
-  %arg0: !tt.ptr<i64>, %peer_id: i64, %metadata: !tt.ptr<i64>
+  %arg0: !tt.ptr<i64>, %arg1: !tt.ptr<i64>, %peer_id: i64, %metadata: !tt.ptr<i64>
 ) -> !tt.ptr<i64> {
   // CHECK-NOT: triton_xla.get_peer_ptr
-  // Byte size of a pointer.
+  // Offset from the beginning of metadata to the peer pointers for %arg1:
+  // offsetof(param_to_peers) + sizeof(uint64_t) * 2 = 8 + 8 * 2 = 24.
+  // CHECK: %c24_i64 = arith.constant 24 : i64
+  // Size of the uint64_t.
   // CHECK: %c8_i64 = arith.constant 8 : i64
 
   // Load metadata->rank
-  // CHECK-NEXT: %0 = tt.load %arg2 : !tt.ptr<i64>
+  // CHECK-NEXT: %0 = tt.load %arg3 : !tt.ptr<i64>
 
   // Calculate offset to current base pointer.
   // CHECK-NEXT: %1 = arith.muli %0, %c8_i64 : i64
 
-  // Load metadata->buffer_root_ptrs[metadata->rank].
+  // Load metadata->param_to_peers[argument_offset + metadata->rank].
+  // Here argument_offset = 0 since %arg0 is the first argument.
   // CHECK-NEXT: %2 = arith.addi %1, %c8_i64 : i64
-  // CHECK-NEXT: %3 = tt.addptr %arg2, %2 : !tt.ptr<i64>, i64
+  // CHECK-NEXT: %3 = tt.addptr %arg3, %2 : !tt.ptr<i64>, i64
   // CHECK-NEXT: %4 = tt.load %3 : !tt.ptr<i64>
 
   // Calculate offset to address.
   // CHECK-NEXT: %5 = tt.ptr_to_int %arg0 : !tt.ptr<i64> -> i64
   // CHECK-NEXT: %6 = arith.subi %5, %4 : i64
 
   // Calculate offset to peer base pointer.
-  // CHECK-NEXT: %7 = arith.muli %arg1, %c8_i64 : i64
+  // CHECK-NEXT: %7 = arith.muli %arg2, %c8_i64 : i64
   // CHECK-NEXT: %8 = arith.addi %7, %c8_i64 : i64
 
-  // Load metadata->buffer_root_ptrs[peer_id].
-  // CHECK-NEXT: %9 = tt.addptr %arg2, %8 : !tt.ptr<i64>, i64
+  // Load metadata->param_to_peers[argument_offset + peer_id].
+  // CHECK-NEXT: %9 = tt.addptr %arg3, %8 : !tt.ptr<i64>, i64
   // CHECK-NEXT: %10 = tt.load %9 : !tt.ptr<i64>
 
-  // Load metadata->buffer_root_ptrs[peer_id] + offset.
+  // Calculate metadata->param_to_peers[argument_offset + peer_id] + offset.
   // CHECK-NEXT: %11 = arith.addi %10, %6 : i64
   // CHECK-NEXT: %12 = tt.int_to_ptr %11 : i64 -> !tt.ptr<i64>
-  // CHECK-NEXT: tt.return %12 : !tt.ptr<i64>
-  %peer_ptr = triton_xla.get_peer_ptr %arg0, %peer_id, %metadata : (!tt.ptr<i64>, i64, !tt.ptr<i64>) -> !tt.ptr<i64>
-  tt.return %peer_ptr : !tt.ptr<i64>
+  %arg_0_peer_ptr = triton_xla.get_peer_ptr %arg0, %peer_id, %metadata,
+     { argument_index = 0 : i32, world_size = 2 : i32 } :
+     (!tt.ptr<i64>, i64, !tt.ptr<i64>) -> !tt.ptr<i64>
+
+  // Load metadata->rank
+  // CHECK-NEXT: %13 = tt.load %arg3 : !tt.ptr<i64>
+  // Calculate offset to current base pointer.
+  // CHECK-NEXT: %14 = arith.muli %13, %c8_i64 : i64
+  // Load metadata->param_to_peers[argument_offset + metadata->rank].
+  // CHECK-NEXT: %15 = arith.addi %14, %c24_i64 : i64
+  // CHECK-NEXT: %16 = tt.addptr %arg3, %15 : !tt.ptr<i64>, i64
+  // CHECK-NEXT: %17 = tt.load %16 : !tt.ptr<i64>
+  // Calculate offset to address.
+  // CHECK-NEXT: %18 = tt.ptr_to_int %arg1 : !tt.ptr<i64> -> i64
+  // CHECK-NEXT: %19 = arith.subi %18, %17 : i64
+
+  // Calculate offset to peer base pointer.
+  // CHECK-NEXT: %20 = arith.muli %arg2, %c8_i64 : i64
+  // CHECK-NEXT: %21 = arith.addi %20, %c24_i64 : i64
+
+  // Load metadata->param_to_peers[argument_offset + peer_id].
+  // CHECK-NEXT: %22 = tt.addptr %arg3, %21 : !tt.ptr<i64>, i64
+  // CHECK-NEXT: %23 = tt.load %22 : !tt.ptr<i64>
+
+  // Calculate metadata->param_to_peers[argument_offset + peer_id] + offset.
+  // CHECK-NEXT: %24 = arith.addi %23, %19 : i64
+  // CHECK-NEXT: %25 = tt.int_to_ptr %24 : i64 -> !tt.ptr<i64>
+
+  %arg_1_peer_ptr = triton_xla.get_peer_ptr %arg1, %peer_id, %metadata,
+     { argument_index = 1 : i32, world_size = 2 : i32 } :
+     (!tt.ptr<i64>, i64, !tt.ptr<i64>) -> !tt.ptr<i64>
+  
+  // Avoid optimizing away the get_peer_ptr calls by returning the bitwise OR of
+  // the two peer pointers.
+  // 
+  // CHECK-NEXT: %26 = tt.ptr_to_int %12 : !tt.ptr<i64> -> i64
+  %int_arg0 = tt.ptr_to_int %arg_0_peer_ptr : !tt.ptr<i64> -> i64
+  // CHECK-NEXT: %27 = tt.ptr_to_int %25 : !tt.ptr<i64> -> i64
+  %int_arg1 = tt.ptr_to_int %arg_1_peer_ptr : !tt.ptr<i64> -> i64
+
+  // CHECK-NEXT: %28 = arith.ori %26, %27 : i64
+  %result_int = arith.ori %int_arg0, %int_arg1 : i64
+  // CHECK-NEXT: %29 = tt.int_to_ptr %28 : i64 -> !tt.ptr<i64>
+  %result_ptr = tt.int_to_ptr %result_int : i64 -> !tt.ptr<i64>
+  // CHECK-NEXT: tt.return %29 : !tt.ptr<i64>
+  tt.return %result_ptr : !tt.ptr<i64>
 }
@@ -67,8 +67,14 @@ LogicalResult LowerGetRankOp(GetRankOp get_rank, PatternRewriter& rewriter) {
 
 // The peer address should be computed as follows:
 //
-// offset = address - metadata->buffer_root_ptrs[metadata->rank].
-// peer_address = metadata->buffer_root_ptrs[peer_id] + offset.
+// argument_offset = world_size * argument_index
+// argument_base = metadata->param_to_peers[argument_offset + metadata->rank]
+// offset = address - argument_base
+// peer_base = metadata->param_to_peers[argument_offset + peer_id]
+// peer_address = peer_base + offset
+//
+// For more details regarding the peer pointers layout, see the comments on
+// `stream_executor::gpu::CollectiveKernelMetadata`.
 LogicalResult LowerGetPeerPtrOp(GetPeerPtrOp get_peer_ptr,
                                 PatternRewriter& rewriter) {
   Value metadata = get_peer_ptr.getMetadata();
@@ -94,16 +100,26 @@ LogicalResult LowerGetPeerPtrOp(GetPeerPtrOp get_peer_ptr,
   // 1. Load metadata->rank.
   Value current_rank_load_op = builder.create<GetRankOp>(metadata);
 
-  // 2. Load metadata->buffer_root_ptrs[metadata->rank].
+  // 2. Calculate argument_offset = world_size * argument_index (in bytes).
+  const int32_t argument_index = get_peer_ptr.getArgumentIndex();
+  const int32_t world_size = get_peer_ptr.getWorldSize();
+  const int32_t argument_offset =
+      world_size * argument_index * sizeof(uint64_t);
+
+  // 3. Load metadata->param_to_peers[argument_offset + metadata->rank].
   Value local_buffers_ptrs_offset = builder.create<arith::ConstantIntOp>(
-      type_i64, offsetof(CollectiveKernelMetadata, buffer_root_ptrs));
+      type_i64, offsetof(CollectiveKernelMetadata, param_to_peers));
 
   Value rank_offset =
       builder.create<arith::ExtUIOp>(type_i64, current_rank_load_op);
+  Value argument_offset_bytes =
+      builder.create<arith::ConstantIntOp>(type_i64, argument_offset);
   Value current_rank_offset_bytes =
       builder.create<arith::MulIOp>(rank_offset, pointer_size_bytes_const);
+  Value argument_ptr_offset_bytes = builder.create<arith::AddIOp>(
+      local_buffers_ptrs_offset, argument_offset_bytes);
   Value current_ptr_offset_bytes = builder.create<arith::AddIOp>(
-      local_buffers_ptrs_offset, current_rank_offset_bytes);
+      argument_ptr_offset_bytes, current_rank_offset_bytes);
 
   Value current_range_address = builder.create<AddPtrOp>(
       metadata.getType(), metadata, current_ptr_offset_bytes);
@@ -115,19 +131,19 @@ LogicalResult LowerGetPeerPtrOp(GetPeerPtrOp get_peer_ptr,
       EvictionPolicyAttr::get(ctx, EvictionPolicy::NORMAL),
       /*isVolatile=*/rewriter.getBoolAttr(false));
 
-  // 3. Calculate offset =
-  //      address - metadata->buffer_root_ptrs[metadata->rank].
+  // 4. Calculate offset =
+  //      address - metadata->param_to_peers[argument_offset + metadata->rank].
   Value current_range_address_int =
       builder.create<PtrToIntOp>(type_i64, address);
   Value offsetInt = builder.create<arith::SubIOp>(current_range_address_int,
                                                   current_range_address_value);
 
-  // 4. Load metadata->buffer_root_ptrs[peer_id].
+  // 5. Load metadata->param_to_peers[argument_offset + peer_id].
   Value peer_index = builder.create<arith::ExtUIOp>(type_i64, peer_id);
   Value peer_index_offset_bytes =
       builder.create<arith::MulIOp>(peer_index, pointer_size_bytes_const);
   Value peer_range_offset_bytes = builder.create<arith::AddIOp>(
-      local_buffers_ptrs_offset, peer_index_offset_bytes);
+      argument_ptr_offset_bytes, peer_index_offset_bytes);
   Value peer_range_address = builder.create<AddPtrOp>(
       metadata.getType(), metadata, peer_range_offset_bytes);
 
@@ -138,7 +154,7 @@ LogicalResult LowerGetPeerPtrOp(GetPeerPtrOp get_peer_ptr,
       EvictionPolicyAttr::get(ctx, EvictionPolicy::NORMAL),
       /*isVolatile=*/rewriter.getBoolAttr(false));
 
-  // 5. Calculate the result address: peerBasePtr + offset.
+  // 6. Calculate the result address: peerBasePtr + offset.
   Value result_int =
       builder.create<arith::AddIOp>(peer_range_address_value, offsetInt);
   Value result_address = builder.create<IntToPtrOp>(result_type, result_int);
 
@@ -16,6 +16,7 @@ limitations under the License.
 #include "xla/backends/gpu/runtime/all_reduce.h"
 
 #include <algorithm>
+#include <cstddef>
 #include <cstdint>
 #include <memory>
 #include <tuple>
@@ -150,15 +151,22 @@ class AllReduceKernelTest : public ::testing::Test,
     }
 
     std::vector<se::DeviceMemoryBase> metadata_buffers;
+    // One for signal and one for input parameters.
+    constexpr int kNumPeerParameters = 2;
+    size_t param_to_peers_size =
+        sizeof(uint64_t) * kNumPeerParameters * num_ranks;
+    std::vector<uint64_t> param_to_peers_ptrs;
+    for (const auto& local_input_buffer : local_input_buffers) {
+      param_to_peers_ptrs.push_back((uint64_t)local_input_buffer.opaque());
+    }
+    for (const auto& signal_flags_buffer : signal_flags_buffers) {
+      param_to_peers_ptrs.push_back((uint64_t)signal_flags_buffer.opaque());
+    }
 
     for (int i = 0; i < num_ranks; ++i) {
       CollectiveKernelMetadata metadata;
       metadata.rank = i;
 
-      for (int j = 0; j < num_ranks; ++j) {
-        metadata.buffer_root_ptrs[j] = (uint64_t)allocated_buffers[j].opaque();
-      }
-
       if (params_.all_reduce_strategy == AllReduceStrategy::kMultimem) {
         stream_executor::gpu::GpuExecutor* gpu_executor =
             dynamic_cast<stream_executor::gpu::GpuExecutor*>(executors[i]);
@@ -171,11 +179,21 @@ class AllReduceKernelTest : public ::testing::Test,
         metadata.multicast_buffer_ptr = 0;
       }
 
+      // Buffer layout: metadata struct first, then the parameter-to-peer ptrs.
       metadata_buffers.emplace_back(executors[i]->AllocateArray<uint64_t>(
-          sizeof(CollectiveKernelMetadata)));
+          sizeof(CollectiveKernelMetadata) + param_to_peers_size));
+
+      se::DeviceMemoryBase param_to_peers_ptrs_buffer =
+          metadata_buffers[i].GetByteSlice(sizeof(CollectiveKernelMetadata),
+                                           param_to_peers_size);
+      metadata.param_to_peers =
+          reinterpret_cast<uint64_t*>(param_to_peers_ptrs_buffer.opaque());
 
       TF_RETURN_IF_ERROR(streams[i]->Memcpy(&metadata_buffers[i], &metadata,
                                             sizeof(CollectiveKernelMetadata)));
+      TF_RETURN_IF_ERROR(streams[i]->Memcpy(&param_to_peers_ptrs_buffer,
+                                            param_to_peers_ptrs.data(),
+                                            param_to_peers_size));
     }
 
     for (int i = 0; i < num_ranks; ++i) {