intel
diff --git a/third_party/openxla.patch b/third_party/openxla.patch
Lines changed: 14 additions & 2 deletions
diff --git a/xla/service/gpu/BUILD b/xla/service/gpu/BUILD
Lines changed: 9 additions & 16 deletions
diff --git a/xla/service/gpu/onednn_gpu_conv_runner.cc b/xla/service/gpu/onednn_gpu_conv_runner.cc
Lines changed: 44 additions & 13 deletions
diff --git a/xla/service/gpu/onednn_gpu_conv_runner.h b/xla/service/gpu/onednn_gpu_conv_runner.h
Lines changed: 6 additions & 38 deletions
@@ -618,7 +618,7 @@ index 8acdeb102..50c7efe11 100644
 
   protected:
 diff --git a/xla/service/computation_placer.cc b/xla/service/computation_placer.cc
-index b896c7d10..02c5a642a 100644
+index b896c7d10..bd5dcea2b 100644
 --- a/xla/service/computation_placer.cc
 +++ b/xla/service/computation_placer.cc
@@ -31,6 +31,7 @@ limitations under the License.
@@ -629,7 +629,19 @@ index b896c7d10..02c5a642a 100644
  #include "xla/types.h"
  #include "xla/util.h"
  #include "tsl/platform/errors.h"
-@@ -216,6 +217,8 @@ static bool InitModule() {
+@@ -164,6 +165,11 @@ absl::StatusOr<DeviceAssignment> ComputationPlacer::AssignDevices(
+   absl::MutexLock lock(&ComputationPlacer::platform_computation_placer_mutex_);
+   auto* computation_placers = GetPlatformComputationPlacers();
+   if (computation_placers->find(platform_id) != computation_placers->end()) {
++    // FIXME(intel): Temporarily skip the registry to avoid a linking warning.
++    // Re-enable this check once the oneDNN custom call code is refined.
++#ifdef TENSORFLOW_USE_SYCL
++    return;
++#endif
+     // TODO(b/282059652): Consider logging the platform name using
+     // PlatformManager::PlatformWithId(). No doing that for now to avoid
+     // introducing unwanted dependency.
+@@ -216,6 +222,8 @@ static bool InitModule() {
        stream_executor::cuda::kCudaPlatformId, &CreateComputationPlacer);
    xla::ComputationPlacer::RegisterComputationPlacer(
        stream_executor::rocm::kROCmPlatformId, &CreateComputationPlacer);
 
@@ -28,13 +28,10 @@ cc_library(
     ],
     visibility = ["//visibility:public"],
     deps = [
-        "//xla/service:onednn_util",
-        "//xla/service/gpu:sycl_onednn",
-        "@xla//xla/ffi",
+        ":sycl_onednn_header",
+        "@xla//xla/ffi:ffi",
         "@xla//xla/ffi:ffi_api",
-        "@xla//xla/stream_executor",
         "@com_google_absl//absl/status",
-        "@com_google_absl//absl/strings",
     ],
     alwayslink = 1,
 )
@@ -56,7 +53,7 @@ cc_library(
     srcs = ["gemm_impl_picker.cc",],
     hdrs = ["gemm_impl_picker.h"],
     deps = [
-        ":sycl_onednn",
+        ":sycl_onednn_header",
         "//xla/stream_executor/sycl:hw_info",
         "@com_google_absl//absl/algorithm:container",
         "@tsl//tsl/platform:errors",
@@ -72,7 +69,6 @@ cc_library(
         "@xla//xla/stream_executor",
         "@xla//xla/stream_executor:device_description",
         "@xla//xla/stream_executor:device_memory_allocator",
-        "@xla//xla/stream_executor/gpu:gpu_stream",
         "@xla//xla/stream_executor/gpu:gpu_timer",
         "@xla//xla/service/gpu:ir_emission_utils",
         "@xla//xla/service/gpu:matmul_utils",
@@ -178,7 +174,7 @@ xpu_library(
 )
 
 cc_import(
-    name = "sycl_onednn",
+    name = "sycl_onednn_header",
     hdrs = [
         "sycl_onednn.h",
         "onednn_gpu_conv_runner.h",
@@ -188,9 +184,8 @@ cc_import(
     visibility = ["//visibility:public"],
     deps = [
         ":scratch_allocator",
-        "@xla//xla/service/gpu:gpu_conv_runner",
-        "@xla//xla/service/gpu:thunk",
         "@xla//xla/service/gpu:matmul_utils",
+        "@xla//xla/stream_executor/gpu:gpu_stream",
     ],
 )
 
@@ -216,11 +211,9 @@ cc_library(
     deps = [
         ":scratch_allocator",
         "//xla/service:onednn_util",
-        "@xla//xla/ffi",
-        "@xla//xla/ffi:ffi_api",
-        "@xla//xla/service/gpu:gpu_conv_runner",
-        "@xla//xla/service/gpu:stream_executor_util",
-        "@xla//xla/service/gpu:thunk",
+        "@xla//xla/ffi:ffi",
+        "@xla//xla/service/gpu:cublas_cudnn",
+        "@xla//xla/stream_executor/gpu:gpu_stream",
     ],
 )
 
@@ -317,4 +310,4 @@ cc_library(
         "@xla//xla/service:hlo_pass",
         "@xla//xla/service:pattern_matcher",
     ],
-)
+)
@@ -18,10 +18,11 @@ limitations under the License.
 #include <string>
 
 #include "xla/service/gpu/scratch_allocator.h"
-#include "xla/service/gpu/stream_executor_util.h"
+#include "xla/service/onednn_util.h"
 
 namespace xla {
 namespace gpu {
+
 using se::DeviceMemory;
 using se::DeviceMemoryBase;
 using se::Stream;
@@ -39,6 +40,29 @@ using ConvBwdInputPd = dnnl::convolution_backward_data::primitive_desc;
 using ConvBwdFilterPd = dnnl::convolution_backward_weights::primitive_desc;
 using ConvBwdFilterPrimitive = dnnl::convolution_backward_weights;
 
+typedef struct OneDnnConvPrimitive {
+  dnnl::memory src_memory;
+  dnnl::memory filter_memory;
+  dnnl::memory dst_memory;
+  dnnl::memory internal_filter_memory;
+  dnnl::memory scratchpad_memory;
+  dnnl::memory bias_memory;
+  dnnl::convolution_forward fwd_primitive;
+  dnnl::convolution_backward_data bwd_input_primitive;
+  dnnl::convolution_backward_weights bwd_filter_primitive;
+  dnnl::reorder filter_reorder_primitive;
+
+  std::unordered_map<int, dnnl::memory> fwd_primitives_args;
+  std::unordered_map<int, dnnl::memory> bwd_input_primitive_args;
+  std::unordered_map<int, dnnl::memory> bwd_filter_primitive_args;
+
+  std::unordered_map<int, dnnl::memory> reorder_args;
+
+  dnnl::engine engine;
+  dnnl::stream stream;
+  bool has_reorder = false;
+} OneDnnConvPrimitive;
+
 namespace {
 
 int64_t GetVectCSize(DataLayout layout) {
@@ -67,7 +91,7 @@ absl::Status CreateOneDnnPrimitive(
     OneDnnConvPrimitive* onednn_primitive,  // NOLINT
     const ffi::Dictionary& dict,
     absl::Span<const ffi::BufferBase> operand_buffers,
-    ffi::BufferBase result_buffer, se::Stream* stream,
+    const ffi::BufferBase& result_buffer, se::Stream* stream,
     se::ScratchAllocator* scratch_allocator, CudnnConvKind conv_kind) {
   sycl::queue* dpcpp_stream = se::gpu::AsGpuStreamValue(stream);
   onednn_primitive->engine = FindOrCreateEngine(dpcpp_stream);
@@ -456,7 +480,8 @@ absl::Status CreateOneDnnPrimitive(
            onednn_primitive->bias_memory});
     }
     if (conv_kind == CudnnConvKind::kForwardActivation) {
-      auto activation_mode = static_cast<stream_executor::dnn::ActivationMode>(*dict.get<int32_t>("activation_mode"));
+      auto activation_mode = static_cast<stream_executor::dnn::ActivationMode>(
+          *dict.get<int32_t>("activation_mode"));
       switch (activation_mode) {
         case stream_executor::dnn::kSigmoid:
           po.append_eltwise(dnnl::algorithm::eltwise_logistic, 1, 0);
@@ -474,7 +499,8 @@ absl::Status CreateOneDnnPrimitive(
           po.append_eltwise(dnnl::algorithm::eltwise_elu, 1, 0);
           break;
         case stream_executor::dnn::kLeakyRelu:
-          po.append_eltwise(dnnl::algorithm::eltwise_relu, *dict.get<float>("leakyrelu_alpha"), 0);
+          po.append_eltwise(dnnl::algorithm::eltwise_relu,
+                            *dict.get<float>("leakyrelu_alpha"), 0);
           break;
         case stream_executor::dnn::kNone:
           break;
@@ -680,30 +706,35 @@ absl::Status CreateOneDnnPrimitive(
 
 absl::StatusOr<OneDnnConvPrimitive> GetOrCreateOneDnnConvPrimitive(
     se::Stream* stream, const ffi::Dictionary& dict,
-    const std::vector<ffi::BufferBase>& operand_se_buffers,
+    absl::Span<const ffi::BufferBase> operand_buffers,
     const ffi::BufferBase& result_buffer,
     se::ScratchAllocator* scratch_allocator, CudnnConvKind conv_kind) {
   OneDnnConvPrimitive primitive;
-  auto status = CreateOneDnnPrimitive(&primitive, dict,
-                                      absl::MakeSpan(operand_se_buffers),
-                                      result_buffer, stream, scratch_allocator,
-                                      conv_kind);
+  auto status =
+      CreateOneDnnPrimitive(&primitive, dict, operand_buffers, result_buffer,
+                            stream, scratch_allocator, conv_kind);
   if (TF_PREDICT_FALSE(!status.ok())) {
     return status;
   }
   return primitive;
 }
 
-absl::Status RunGpuConv(const OneDnnConvPrimitive& onednn_primitive,
-                        const ffi::Dictionary& dict,
+absl::Status RunGpuConv(se::Stream* stream, const ffi::Dictionary& dict,
                         absl::Span<const ffi::BufferBase> operand_buffers,
-                        ffi::BufferBase result_buffer, CudnnConvKind conv_kind) {
+                        ffi::BufferBase& result_buffer,
+                        se::ScratchAllocator* allocator,
+                        CudnnConvKind conv_kind) {
   void* input_data;
   void* filter_data;
   void* output_data;
   void* bias_data = nullptr;
   void* side_input_data = nullptr;
 
+  TF_ASSIGN_OR_RETURN(
+      auto onednn_primitive,
+      GetOrCreateOneDnnConvPrimitive(stream, dict, operand_buffers,
+                                     result_buffer, allocator, conv_kind));
+
   switch (conv_kind) {
     case CudnnConvKind::kForward:
     case CudnnConvKind::kForwardActivation:
@@ -776,4 +807,4 @@ absl::Status RunGpuConv(const OneDnnConvPrimitive& onednn_primitive,
 }
 
 }  // namespace gpu
-}  // namespace xla
+}  // namespace xla
@@ -19,50 +19,18 @@ limitations under the License.
 #include <optional>
 
 #include "xla/ffi/ffi.h"
-#include "xla/ffi/ffi_api.h"
-#include "xla/service/gpu/gpu_conv_runner.h"
-#include "xla/service/gpu/thunk.h"
-#include "xla/service/onednn_util.h"
+#include "xla/service/gpu/cublas_cudnn.h"
+#include "xla/stream_executor/gpu/gpu_stream.h"
 
 namespace xla {
 
 namespace gpu {
 
-typedef struct OneDnnConvPrimitive {
-  dnnl::memory src_memory;
-  dnnl::memory filter_memory;
-  dnnl::memory dst_memory;
-  dnnl::memory internal_filter_memory;
-  dnnl::memory scratchpad_memory;
-  dnnl::memory bias_memory;
-  dnnl::convolution_forward fwd_primitive;
-  dnnl::convolution_backward_data bwd_input_primitive;
-  dnnl::convolution_backward_weights bwd_filter_primitive;
-  dnnl::reorder filter_reorder_primitive;
-
-  std::unordered_map<int, dnnl::memory> fwd_primitives_args;
-  std::unordered_map<int, dnnl::memory> bwd_input_primitive_args;
-  std::unordered_map<int, dnnl::memory> bwd_filter_primitive_args;
-
-  std::unordered_map<int, dnnl::memory> reorder_args;
-
-  dnnl::engine engine;
-  dnnl::stream stream;
-  bool has_reorder = false;
-} OneDnnConvPrimitive;
-
-absl::StatusOr<OneDnnConvPrimitive> GetOrCreateOneDnnConvPrimitive(
-    se::Stream*, const ffi::Dictionary& dict,
-    const std::vector<ffi::BufferBase>& operand_se_buffers,
-    const ffi::BufferBase& result_buffer,
-    se::ScratchAllocator* scratch_allocator, CudnnConvKind conv_kind);
-
-absl::Status RunGpuConv(const OneDnnConvPrimitive& onednn_primitive,
-                        const ffi::Dictionary& dict,
-                        absl::Span<const ffi::BufferBase> operand_buffers,
-                        ffi::BufferBase result_buffer, CudnnConvKind conv_kind);
+absl::Status RunGpuConv(se::Stream*, const ffi::Dictionary&,
+                        absl::Span<const ffi::BufferBase>, ffi::BufferBase&,
+                        se::ScratchAllocator*, CudnnConvKind);
 
 }  // namespace gpu
 }  // namespace xla
 
-#endif  // XLA_SERVICE_GPU_ONEDNN_GPU_CONV_RUNNER_H_
+#endif  // XLA_SERVICE_GPU_ONEDNN_GPU_CONV_RUNNER_H_