Skip to content

Commit 4168481

Browse files
authored
fix ffi performance(main) (#382)
1 parent 3144e1f commit 4168481

File tree

6 files changed

+126
-199
lines changed

6 files changed

+126
-199
lines changed

third_party/openxla.patch

Lines changed: 85 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -3232,7 +3232,7 @@ index c888aff30..0fd7254d9 100644
32323232
// Extract the memory value returned from atomicCAS and store it as
32333233
// cas_old_output.
32343234
diff --git a/xla/service/gpu/ir_emitter_unnested.cc b/xla/service/gpu/ir_emitter_unnested.cc
3235-
index 86426953a..620f092c7 100644
3235+
index 86426953a..afd144957 100644
32363236
--- a/xla/service/gpu/ir_emitter_unnested.cc
32373237
+++ b/xla/service/gpu/ir_emitter_unnested.cc
32383238
@@ -1,4 +1,6 @@
@@ -3852,7 +3852,7 @@ index 86426953a..620f092c7 100644
38523852
static absl::StatusOr<CustomCallThunk::AttributesMap> BuildAttributesMap(
38533853
mlir::DictionaryAttr dict) {
38543854
CustomCallThunk::AttributesMap attributes;
3855-
@@ -1314,6 +1337,103 @@ static absl::StatusOr<CustomCallThunk::AttributesMap> BuildAttributesMap(
3855+
@@ -1314,6 +1337,106 @@ static absl::StatusOr<CustomCallThunk::AttributesMap> BuildAttributesMap(
38563856
}
38573857
return attributes;
38583858
}
@@ -3862,9 +3862,17 @@ index 86426953a..620f092c7 100644
38623862
+ // After 0.4.26, ffi support absl::span.
38633863
+ // Below attrs can be refine to absl::span for reducing key-value
38643864
+ CustomCallThunk::AttributesMap attrs;
3865-
+ attrs["backend_config_str"] = instr->raw_backend_config_string();
38663865
+ if (IsCustomCallToDnnConvolution(*instr)) {
3866+
+ TF_ASSIGN_OR_RETURN(auto gpu_config,
3867+
+ instr->backend_config<GpuBackendConfig>());
3868+
+ const CudnnConvBackendConfig& backend_config =
3869+
+ gpu_config.cudnn_conv_backend_config();
38673870
+ TF_ASSIGN_OR_RETURN(CudnnConvKind kind, GetCudnnConvKind(instr));
3871+
+ attrs["conv_result_scale"] = static_cast<float>(backend_config.conv_result_scale());
3872+
+ attrs["side_input_scale"] = static_cast<float>(backend_config.side_input_scale());
3873+
+ attrs["activation_mode"] = static_cast<int32_t>(backend_config.activation_mode());
3874+
+ attrs["leakyrelu_alpha"] = static_cast<float>(backend_config.leakyrelu_alpha());
3875+
+
38683876
+ const Window& window = instr->window();
38693877
+ const ConvolutionDimensionNumbers& dnums =
38703878
+ instr->convolution_dimension_numbers();
@@ -3930,23 +3938,18 @@ index 86426953a..620f092c7 100644
39303938
+ attrs["filter_dl"] = static_cast<int32_t>(filter_dl);
39313939
+ attrs["output_dl"] = static_cast<int32_t>(output_dl);
39323940
+ } else if (IsLegacyCublasMatmul(*instr) || IsCublasLtMatmul(*instr)) {
3933-
+ const Shape& lhs_shape = instr->operand(0)->shape();
3934-
+ const Shape& rhs_shape = instr->operand(1)->shape();
3935-
+ const Shape& output_shape = instr->shape().IsTuple()
3936-
+ ? instr->shape().tuple_shapes(0)
3937-
+ : instr->shape();
3938-
+ for (int i = 0; i < lhs_shape.layout().minor_to_major().size(); ++i) {
3939-
+ attrs["lhs_minor_to_major_" + std::to_string(i)] =
3940-
+ lhs_shape.layout().minor_to_major()[i];
3941-
+ }
3942-
+ for (int i = 0; i < rhs_shape.layout().minor_to_major().size(); ++i) {
3943-
+ attrs["rhs_minor_to_major_" + std::to_string(i)] =
3944-
+ rhs_shape.layout().minor_to_major()[i];
3945-
+ }
3946-
+ for (int i = 0; i < output_shape.layout().minor_to_major().size(); ++i) {
3947-
+ attrs["output_minor_to_major_" + std::to_string(i)] =
3948-
+ output_shape.layout().minor_to_major()[i];
3949-
+ }
3941+
+ TF_ASSIGN_OR_RETURN(const auto gpu_config,
3942+
+ instr->backend_config<xla::gpu::GpuBackendConfig>());
3943+
+ xla::gpu::GemmBackendConfig config = gpu_config.gemm_backend_config();
3944+
+ xla::gpu::GemmBackendConfig_Epilogue epilogue = config.epilogue();
3945+
+ TF_ASSIGN_OR_RETURN(
3946+
+ auto gemm_config,
3947+
+ GemmConfig::For(static_cast<const HloInstruction*>(instr)));
3948+
+ GemmConfig* gemm_config_ptr = new GemmConfig(gemm_config);
3949+
+ attrs["epilogue"] = static_cast<int32_t>(epilogue);
3950+
+ // SYCL TODO:
3951+
+ // gemm_config may be split into separate parameters and added to attrs later.
3952+
+ attrs["gemm_config_ptr"] = reinterpret_cast<int64_t>(gemm_config_ptr);
39503953
+ } else {
39513954
+ return absl::InternalError("Unknown CustomCall To SYCL FFI Call");
39523955
+ }
@@ -3956,15 +3959,15 @@ index 86426953a..620f092c7 100644
39563959

39573960
absl::Status IrEmitterUnnested::EmitCustomCallThunk(
39583961
const HloCustomCallInstruction* instr) {
3959-
@@ -1433,6 +1553,7 @@ absl::Status IrEmitterUnnested::EmitCustomCallThunk(
3962+
@@ -1433,6 +1556,7 @@ absl::Status IrEmitterUnnested::EmitCustomCallThunk(
39603963
}
39613964

39623965
auto& backend_config_str = instr->raw_backend_config_string();
39633966
+
39643967
switch (instr->api_version()) {
39653968
case CustomCallApiVersion::API_VERSION_ORIGINAL:
39663969
case CustomCallApiVersion::API_VERSION_STATUS_RETURNING:
3967-
@@ -1443,6 +1564,12 @@ absl::Status IrEmitterUnnested::EmitCustomCallThunk(
3970+
@@ -1443,6 +1567,12 @@ absl::Status IrEmitterUnnested::EmitCustomCallThunk(
39683971
break;
39693972

39703973
case CustomCallApiVersion::API_VERSION_TYPED_FFI:
@@ -3977,7 +3980,7 @@ index 86426953a..620f092c7 100644
39773980
if (!backend_config_str.empty()) {
39783981
mlir::Attribute attr = mlir::parseAttribute(
39793982
backend_config_str, ir_emitter_context_->mlir_context());
3980-
@@ -1455,7 +1582,7 @@ absl::Status IrEmitterUnnested::EmitCustomCallThunk(
3983+
@@ -1455,7 +1585,7 @@ absl::Status IrEmitterUnnested::EmitCustomCallThunk(
39813984
"dictionary attribute");
39823985
}
39833986
break;
@@ -3986,7 +3989,7 @@ index 86426953a..620f092c7 100644
39863989
default:
39873990
return Internal("Unknown custom-call API version enum value: %d",
39883991
instr->api_version());
3989-
@@ -1496,7 +1623,7 @@ absl::Status IrEmitterUnnested::EmitFftThunk(const HloFftInstruction* instr) {
3992+
@@ -1496,7 +1626,7 @@ absl::Status IrEmitterUnnested::EmitFftThunk(const HloFftInstruction* instr) {
39903993
return absl::OkStatus();
39913994
}
39923995

@@ -3995,7 +3998,7 @@ index 86426953a..620f092c7 100644
39953998

39963999
absl::Status IrEmitterUnnested::EmitTriangularSolveCustomCall(
39974000
const HloInstruction* instr) {
3998-
@@ -1576,7 +1703,7 @@ absl::Status IrEmitterUnnested::EmitTriangularSolveCustomCall(
4001+
@@ -1576,7 +1706,7 @@ absl::Status IrEmitterUnnested::EmitTriangularSolveCustomCall(
39994002
}
40004003
return absl::OkStatus();
40014004
}
@@ -4004,7 +4007,7 @@ index 86426953a..620f092c7 100644
40044007

40054008
absl::Status IrEmitterUnnested::EmitTopKCustomCall(
40064009
const HloCustomCallInstruction* instr) {
4007-
@@ -2602,33 +2729,33 @@ absl::Status IrEmitterUnnested::EmitCopyStartThunk(
4010+
@@ -2602,33 +2732,33 @@ absl::Status IrEmitterUnnested::EmitCopyStartThunk(
40084011
}
40094012

40104013
absl::Status IrEmitterUnnested::EmitSendThunk(const HloSendInstruction* instr) {
@@ -4064,7 +4067,7 @@ index 86426953a..620f092c7 100644
40644067

40654068
return absl::OkStatus();
40664069
}
4067-
@@ -2650,33 +2777,33 @@ absl::Status IrEmitterUnnested::EmitSendDoneThunk(
4070+
@@ -2650,33 +2780,33 @@ absl::Status IrEmitterUnnested::EmitSendDoneThunk(
40684071
}
40694072

40704073
absl::Status IrEmitterUnnested::EmitRecvThunk(const HloRecvInstruction* instr) {
@@ -4124,7 +4127,7 @@ index 86426953a..620f092c7 100644
41244127

41254128
return absl::OkStatus();
41264129
}
4127-
@@ -2798,13 +2925,31 @@ absl::Status IrEmitterUnnested::EmitHloInstruction(
4130+
@@ -2798,13 +2928,31 @@ absl::Status IrEmitterUnnested::EmitHloInstruction(
41284131
case HloOpcode::kCustomCall: {
41294132
auto* custom_call = Cast<HloCustomCallInstruction>(instr);
41304133
if (IsLegacyCublasMatmul(*instr)) {
@@ -4158,7 +4161,7 @@ index 86426953a..620f092c7 100644
41584161
#if GOOGLE_CUDA
41594162
if (IsCublasLtMatmulF8(*instr)) {
41604163
return EmitCublasLtMatmulThunkF8(custom_call);
4161-
@@ -2815,30 +2960,32 @@ absl::Status IrEmitterUnnested::EmitHloInstruction(
4164+
@@ -2815,30 +2963,32 @@ absl::Status IrEmitterUnnested::EmitHloInstruction(
41624165
if (IsCustomCallToDnnNorm(*instr)) {
41634166
return EmitNormThunk(custom_call);
41644167
}
@@ -4753,10 +4756,18 @@ index 15e6e6692..db8ebb0e1 100644
47534756
} // namespace xla::gpu
47544757

47554758
diff --git a/xla/service/gpu/runtime/BUILD b/xla/service/gpu/runtime/BUILD
4756-
index 4f298bbe0..39b81b2c7 100644
4759+
index 4f298bbe0..1a45c9a12 100644
47574760
--- a/xla/service/gpu/runtime/BUILD
47584761
+++ b/xla/service/gpu/runtime/BUILD
4759-
@@ -427,6 +427,7 @@ cc_library(
4762+
@@ -381,6 +381,7 @@ cc_library(
4763+
"//xla/service:custom_call_status",
4764+
"//xla/service:custom_call_status_internal",
4765+
"//xla/service:executable",
4766+
+ "//xla/service/gpu:matmul_utils",
4767+
"//xla/service/gpu:thunk",
4768+
"//xla/stream_executor:device_memory",
4769+
"//xla/stream_executor/gpu:gpu_stream_header",
4770+
@@ -427,6 +428,7 @@ cc_library(
47604771
"//xla/service/gpu:thunk",
47614772
"//xla/stream_executor",
47624773
"@com_google_absl//absl/container:flat_hash_map",
@@ -4922,10 +4933,17 @@ index 02aecd464..df9213bae 100644
49224933
absl::flat_hash_map<const stream_executor::Stream*,
49234934
std::unique_ptr<GenericConvRunner>>
49244935
diff --git a/xla/service/gpu/runtime/custom_call_thunk.cc b/xla/service/gpu/runtime/custom_call_thunk.cc
4925-
index 28a7dcebf..4c8727f6b 100644
4936+
index 28a7dcebf..faaa07689 100644
49264937
--- a/xla/service/gpu/runtime/custom_call_thunk.cc
49274938
+++ b/xla/service/gpu/runtime/custom_call_thunk.cc
4928-
@@ -36,7 +36,7 @@ limitations under the License.
4939+
@@ -30,13 +30,14 @@ limitations under the License.
4940+
#include "xla/service/buffer_assignment.h"
4941+
#include "xla/service/custom_call_status.h"
4942+
#include "xla/service/custom_call_status_internal.h"
4943+
+#include "xla/service/gpu/matmul_utils.h"
4944+
#include "xla/service/gpu/thunk.h"
4945+
#include "xla/service/service_executable_run_options.h"
4946+
#include "xla/status.h"
49294947
#include "xla/stream_executor/device_memory.h"
49304948
#include "xla/util.h"
49314949

@@ -4934,7 +4952,26 @@ index 28a7dcebf..4c8727f6b 100644
49344952
#include "xla/stream_executor/gpu/gpu_stream.h"
49354953
#endif
49364954

4937-
@@ -89,7 +89,7 @@ absl::Status CustomCallThunk::ExecuteCustomCall(const ExecuteParams& params) {
4955+
@@ -70,6 +71,18 @@ CustomCallThunk::CustomCallThunk(ThunkInfo thunk_info, XLA_FFI_Handler* handler,
4956+
attributes_(std::move(attributes)),
4957+
called_computation_(called_computation) {}
4958+
4959+
+#ifdef TENSORFLOW_USE_SYCL
4960+
+ CustomCallThunk::~CustomCallThunk(){
4961+
+ if(attributes_.find("gemm_config_ptr") != attributes_.end()){
4962+
+ GemmConfig* gemm_config_ptr =
4963+
+ reinterpret_cast<GemmConfig*>(std::get<int64_t>(attributes_["gemm_config_ptr"]));
4964+
+ if(gemm_config_ptr != nullptr){
4965+
+ delete gemm_config_ptr;
4966+
+ }
4967+
+ }
4968+
+ }
4969+
+#endif
4970+
+
4971+
absl::Status CustomCallThunk::ExecuteCustomCall(const ExecuteParams& params) {
4972+
// gpu_stream is CUstream or e.g. the equivalent type in ROCm.
4973+
std::vector<void*> buffers;
4974+
@@ -89,7 +102,7 @@ absl::Status CustomCallThunk::ExecuteCustomCall(const ExecuteParams& params) {
49384975
}
49394976
}
49404977

@@ -4943,7 +4980,7 @@ index 28a7dcebf..4c8727f6b 100644
49434980
auto gpu_stream = se::gpu::AsGpuStreamValue(params.stream);
49444981
XlaCustomCallStatus custom_call_status;
49454982
call_target_(gpu_stream, buffers.data(), opaque_.data(), opaque_.size(),
4946-
@@ -100,11 +100,11 @@ absl::Status CustomCallThunk::ExecuteCustomCall(const ExecuteParams& params) {
4983+
@@ -100,11 +113,11 @@ absl::Status CustomCallThunk::ExecuteCustomCall(const ExecuteParams& params) {
49474984
} else {
49484985
return absl::OkStatus();
49494986
}
@@ -4957,7 +4994,7 @@ index 28a7dcebf..4c8727f6b 100644
49574994
}
49584995

49594996
absl::Status CustomCallThunk::ExecuteFfiHandler(const ExecuteParams& params) {
4960-
@@ -139,6 +139,10 @@ absl::Status CustomCallThunk::ExecuteFfiHandler(const ExecuteParams& params) {
4997+
@@ -139,6 +152,10 @@ absl::Status CustomCallThunk::ExecuteFfiHandler(const ExecuteParams& params) {
49614998
// execution context, as apparently it's not easily accessible from Thunk.
49624999
ExecutableRunOptions run_options;
49635000
run_options.set_stream(params.stream);
@@ -4969,7 +5006,7 @@ index 28a7dcebf..4c8727f6b 100644
49695006

49705007
CallOptions options = {&service_run_options, called_computation_};
49715008
diff --git a/xla/service/gpu/runtime/custom_call_thunk.h b/xla/service/gpu/runtime/custom_call_thunk.h
4972-
index 5fa1dce32..e75b61636 100644
5009+
index 5fa1dce32..03a61bd0c 100644
49735010
--- a/xla/service/gpu/runtime/custom_call_thunk.h
49745011
+++ b/xla/service/gpu/runtime/custom_call_thunk.h
49755012
@@ -35,7 +35,7 @@ limitations under the License.
@@ -4996,6 +5033,17 @@ index 5fa1dce32..e75b61636 100644
49965033

49975034
using CustomCallTarget = std::function<void(Stream, void**, const char*,
49985035
size_t, XlaCustomCallStatus*)>;
5036+
@@ -94,6 +94,10 @@ class CustomCallThunk : public Thunk {
5037+
const std::vector<std::optional<Slice>>& results() const { return results_; }
5038+
absl::string_view opaque() const { return opaque_; }
5039+
5040+
+#ifdef TENSORFLOW_USE_SYCL
5041+
+ ~CustomCallThunk();
5042+
+#endif
5043+
+
5044+
private:
5045+
absl::Status ExecuteCustomCall(const ExecuteParams& params);
5046+
absl::Status ExecuteFfiHandler(const ExecuteParams& params);
49995047
diff --git a/xla/service/gpu/runtime/fft_thunk.cc b/xla/service/gpu/runtime/fft_thunk.cc
50005048
index 728c36752..fccde5793 100644
50015049
--- a/xla/service/gpu/runtime/fft_thunk.cc

xla/service/gpu/onednn_gpu_conv_runner.cc

Lines changed: 27 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,6 @@ int64_t GetVectCSize(FilterLayout layout) {
6666
absl::Status CreateOneDnnPrimitive(
6767
OneDnnConvPrimitive* onednn_primitive, // NOLINT
6868
const ffi::Dictionary& dict,
69-
absl::flat_hash_map<std::string, std::string>& backend_dict,
7069
absl::Span<const ffi::BufferBase> operand_buffers,
7170
ffi::BufferBase result_buffer, se::Stream* stream,
7271
se::ScratchAllocator* scratch_allocator, CudnnConvKind conv_kind) {
@@ -89,7 +88,7 @@ absl::Status CreateOneDnnPrimitive(
8988
void* bias_data = nullptr;
9089
void* side_input_data = nullptr;
9190

92-
float conv_result_scale = std::stof(backend_dict["conv_result_scale"]);
91+
float conv_result_scale = *dict.get<float>("conv_result_scale");
9392
bool conv_result_scale_one = (fabs(conv_result_scale - 1.0f) < 1e-6);
9493

9594
switch (conv_kind) {
@@ -139,7 +138,7 @@ absl::Status CreateOneDnnPrimitive(
139138
bias_data = const_cast<void*>(operand_buffers[2].data.opaque());
140139
if (operand_buffers.size() >= 4) {
141140
side_input_data = const_cast<void*>(operand_buffers[3].data.opaque());
142-
side_input_scale = std::stof(backend_dict["side_input_scale"]);
141+
side_input_scale = *dict.get<float>("side_input_scale");
143142
side_input_scale_zero = (fabs(side_input_scale - 0.0f) < 1e-6);
144143
}
145144
}
@@ -457,22 +456,30 @@ absl::Status CreateOneDnnPrimitive(
457456
onednn_primitive->bias_memory});
458457
}
459458
if (conv_kind == CudnnConvKind::kForwardActivation) {
460-
if (backend_dict["activation_mode"] == "kNone") {
461-
} else if (backend_dict["activation_mode"] == "kSigmoid") {
462-
po.append_eltwise(dnnl::algorithm::eltwise_logistic, 1, 0);
463-
} else if (backend_dict["activation_mode"] == "kRelu") {
464-
po.append_eltwise(dnnl::algorithm::eltwise_relu, 0, 0);
465-
} else if (backend_dict["activation_mode"] == "kRelu6") {
466-
po.append_eltwise(dnnl::algorithm::eltwise_clip_v2, 0, 6);
467-
} else if (backend_dict["activation_mode"] == "kTanh") {
468-
po.append_eltwise(dnnl::algorithm::eltwise_tanh, 0, 0);
469-
} else if (backend_dict["activation_mode"] == "kElu") {
470-
po.append_eltwise(dnnl::algorithm::eltwise_elu, 1, 0);
471-
} else if (backend_dict["activation_mode"] == "kLeakyRelu") {
472-
float leakyrelu_alpha = std::stof(backend_dict["leakyrelu_alpha"]);
473-
po.append_eltwise(dnnl::algorithm::eltwise_relu, leakyrelu_alpha, 0);
474-
} else {
475-
return Internal("Unsupported Activation mode");
459+
auto activation_mode = static_cast<stream_executor::dnn::ActivationMode>(*dict.get<int32_t>("activation_mode"));
460+
switch (activation_mode) {
461+
case stream_executor::dnn::kSigmoid:
462+
po.append_eltwise(dnnl::algorithm::eltwise_logistic, 1, 0);
463+
break;
464+
case stream_executor::dnn::kRelu:
465+
po.append_eltwise(dnnl::algorithm::eltwise_relu, 0, 0);
466+
break;
467+
case stream_executor::dnn::kRelu6:
468+
po.append_eltwise(dnnl::algorithm::eltwise_clip_v2, 0, 6);
469+
break;
470+
case stream_executor::dnn::kTanh:
471+
po.append_eltwise(dnnl::algorithm::eltwise_tanh, 0, 0);
472+
break;
473+
case stream_executor::dnn::kElu:
474+
po.append_eltwise(dnnl::algorithm::eltwise_elu, 1, 0);
475+
break;
476+
case stream_executor::dnn::kLeakyRelu:
477+
po.append_eltwise(dnnl::algorithm::eltwise_relu, *dict.get<float>("leakyrelu_alpha"), 0);
478+
break;
479+
case stream_executor::dnn::kNone:
480+
break;
481+
default:
482+
return Internal("Unsupported Activation mode");
476483
}
477484
}
478485
post_ops_attr.set_post_ops(po);
@@ -673,12 +680,11 @@ absl::Status CreateOneDnnPrimitive(
673680

674681
absl::StatusOr<OneDnnConvPrimitive> GetOrCreateOneDnnConvPrimitive(
675682
se::Stream* stream, const ffi::Dictionary& dict,
676-
absl::flat_hash_map<std::string, std::string>& backend_dict,
677683
const std::vector<ffi::BufferBase>& operand_se_buffers,
678684
const ffi::BufferBase& result_buffer,
679685
se::ScratchAllocator* scratch_allocator, CudnnConvKind conv_kind) {
680686
OneDnnConvPrimitive primitive;
681-
auto status = CreateOneDnnPrimitive(&primitive, dict, backend_dict,
687+
auto status = CreateOneDnnPrimitive(&primitive, dict,
682688
absl::MakeSpan(operand_se_buffers),
683689
result_buffer, stream, scratch_allocator,
684690
conv_kind);

xla/service/gpu/onednn_gpu_conv_runner.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,6 @@ typedef struct OneDnnConvPrimitive {
5353

5454
absl::StatusOr<OneDnnConvPrimitive> GetOrCreateOneDnnConvPrimitive(
5555
se::Stream*, const ffi::Dictionary& dict,
56-
absl::flat_hash_map<std::string, std::string>& backend_dict,
5756
const std::vector<ffi::BufferBase>& operand_se_buffers,
5857
const ffi::BufferBase& result_buffer,
5958
se::ScratchAllocator* scratch_allocator, CudnnConvKind conv_kind);

0 commit comments

Comments
 (0)