From b0b89af1feb0bc4e369d3150279d90d530cd8933 Mon Sep 17 00:00:00 2001
From: weihanmines <weihan13@amd.com>
Date: Wed, 19 Jan 2022 02:43:38 +0000
Subject: [PATCH 1/3] rocblas(mlir_tests) passed unit tests

---
 backends/gpu/lib/kernels/blas_kernels.cc      |  33 +--
 backends/gpu/mlir_tests/rocm/BUILD            |  30 +++
 backends/gpu/mlir_tests/rocm/blas.mlir        | 193 ++++++++++++++++++
 backends/gpu/tools/stub_codegen/hip.json      |   5 +-
 backends/gpu/tools/stub_codegen/hipfft.json   |   4 +-
 backends/gpu/tools/stub_codegen/miopen.json   |   4 +-
 backends/gpu/tools/stub_codegen/rocblas.json  |   4 +-
 .../gpu/tools/stub_codegen/rocsolver.json     |   4 +-
 8 files changed, 251 insertions(+), 26 deletions(-)
 create mode 100644 backends/gpu/mlir_tests/rocm/BUILD
 create mode 100644 backends/gpu/mlir_tests/rocm/blas.mlir
diff --git a/backends/gpu/lib/kernels/blas_kernels.cc b/backends/gpu/lib/kernels/blas_kernels.cc
index 042cb3713d2..29f2e072885 100644
--- a/backends/gpu/lib/kernels/blas_kernels.cc
+++ b/backends/gpu/lib/kernels/blas_kernels.cc
@@ -178,11 +178,11 @@ static Error BlasTrsmBatch(
   // TODO(hanbinyoon): Also support the ROCm function corresponding to
   // cublas<t>trsmBatched.
   auto platform = handle->platform();
-  if (platform != wrapper::Platform::CUDA)
+  if (platform != wrapper::Platform::CUDA && platform != wrapper::Platform::ROCm)
     return MakeStringError("Unsupported platform ", platform);
 
-  cudaDataType data_type = wrapper::BlasDataType::FromOpaqueValue(*dataType);
-  auto alpha_ptr = GetScalePointer(alpha, data_type);
+  rocblas_datatype data_type = wrapper::BlasDataType::FromOpaqueValue(*dataType);
+  auto alpha_ptr = GetScalePointer(alpha, static_cast<wrapper::BlasDataType>(data_type));
   if (!alpha_ptr) return alpha_ptr.takeError();
 
   auto call = [&](auto dummy) {
@@ -200,7 +200,7 @@ static Error BlasTrsmBatch(
     const T** a_array = const_cast<const T**>(b_array + batchCount);
 
     auto side_mode = wrapper::BlasSideMode::FromOpaqueValue(*sideMode);
-    ptrdiff_t a_batch_stride = side_mode == CUBLAS_SIDE_LEFT ? m * m : n * n;
+    ptrdiff_t a_batch_stride = (side_mode == rocblas_side_left) ? m * m : n * n;
     ptrdiff_t b_batch_stride = m * n;
     const T* a_ptr = static_cast<const T*>(A.pointer().raw(platform));
     T* b_ptr = static_cast<T*>(B.pointer().raw(platform));
@@ -213,23 +213,24 @@ static Error BlasTrsmBatch(
     wrapper::Pointer<T*> b_array_ptr(b_array, platform);
     auto cast_alpha_ptr = static_cast<wrapper::Pointer<const T>>(*alpha_ptr);
 
-    return wrapper::CublasTrsmBatched(
-        *current, handle.get(), side_mode,
-        wrapper::BlasFillMode::FromOpaqueValue(*fillMode),
-        wrapper::BlasOperation::FromOpaqueValue(*trans),
-        wrapper::BlasDiagType::FromOpaqueValue(*diagType), m, n, cast_alpha_ptr,
-        a_array_ptr, heightA, b_array_ptr, heightB, batchCount);
+    
+    return wrapper::RocblasTrsmBatched(
+         *current, handle.get(), side_mode,
+         wrapper::BlasFillMode::FromOpaqueValue(*fillMode),
+         wrapper::BlasOperation::FromOpaqueValue(*trans),
+         wrapper::BlasDiagType::FromOpaqueValue(*diagType), m, n, cast_alpha_ptr,
+         a_array_ptr, heightA, b_array_ptr, heightB, batchCount);
   };
 
   switch (data_type) {
-    case CUDA_R_32F:
+    case rocblas_datatype_f32_r:
       return call(float{});
-    case CUDA_R_64F:
+    case rocblas_datatype_f64_r:
       return call(double{});
-    case CUDA_C_32F:
-      return call(cuComplex{});
-    case CUDA_C_64F:
-      return call(cuDoubleComplex{});
+    case rocblas_datatype_f32_c:
+      return call(rocblas_float_complex{});
+    case rocblas_datatype_f64_c:
+      return call(rocblas_double_complex{});
     default:
       return MakeStringError("Unsupported data type ", data_type);
   }
diff --git a/backends/gpu/mlir_tests/rocm/BUILD b/backends/gpu/mlir_tests/rocm/BUILD
new file mode 100644
index 00000000000..b0284597910
--- /dev/null
+++ b/backends/gpu/mlir_tests/rocm/BUILD
@@ -0,0 +1,30 @@
+load("@tf_runtime//:build_defs.bzl", "if_oss")
+load("@tf_runtime//tools:mlir_to_bef.bzl", "glob_tfrt_lit_tests")
+
+licenses(["notice"])
+
+glob_tfrt_lit_tests(
+    data = [":test_utilities"],
+    # copybara:uncomment driver = "@tf_runtime//backends/gpu/mlir_tests:run_lit.sh",
+    tags_override = dict({
+        file: ["requires-gpu-nvidia"]
+        for file in glob(["*.mlir"])
+    }.items() + {
+        # Note: NCCL in the sandbox produces the wrong result in OSS.
+        "ccl.mlir": ["requires-gpu-nvidia:2"] + if_oss(["no-sandbox"]),
+    }.items()),
+    tfrt_translate = "@tf_runtime//backends/gpu:tfrt_gpu_translate",
+)
+
+# Bundle together all of the test utilities that are used by tests.
+filegroup(
+    name = "test_utilities",
+    testonly = True,
+    srcs = [
+        "@llvm-project//llvm:FileCheck",
+        "@tf_runtime//backends/gpu:tfrt_gpu_executor",
+        "@tf_runtime//backends/gpu:tfrt_gpu_opt",
+        "@tf_runtime//backends/gpu:tfrt_gpu_translate",
+        "@tf_runtime//tools:bef_executor_lite",
+    ],
+)
diff --git a/backends/gpu/mlir_tests/rocm/blas.mlir b/backends/gpu/mlir_tests/rocm/blas.mlir
new file mode 100644
index 00000000000..fb2969165a6
--- /dev/null
+++ b/backends/gpu/mlir_tests/rocm/blas.mlir
@@ -0,0 +1,193 @@
+// Copyright 2020 The TensorFlow Runtime Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// RUN: bef_executor_lite %s.bef | FileCheck %s
+
+// CHECK-LABEL: --- Running 'blas_axpy'
+func @blas_axpy() {
+  %ch1 = tfrt.new.chain
+  %ordinal = tfrt.constant.i32 0
+  %device = tfrt_gpu.device.get ROCm, %ordinal
+  %context = tfrt_gpu.context.create %device
+  %allocator = tfrt_gpu.allocator.create %context
+  %stream = tfrt_gpu.stream.create %context
+  %blas = tfrt_gpu.blas.create %context
+
+  %buffer_length = tfrt.constant.i32 4 // [2, 2] = 4 floats
+  %buffer_size_bytes = tfrt.constant.i64 16 // [2, 2] * 4 bytes floats = 16 bytes
+
+  %host_tensor = tfrt_dht.create_uninitialized_tensor.f32.2 [2 : i64, 2 : i64]
+  %host_buffer, %ch2 = tfrt_dht.get_buffer %host_tensor, %ch1
+
+  %ch3 = tfrt_dht.set_tensor_with_constant_values.f32 %host_tensor, %ch2 [1.0 : f32, 2.0 : f32, 3.0 : f32, 4.0 : f32]
+  %gpu_buffer_0 = tfrt_gpu.mem.allocate %allocator, %stream, %buffer_size_bytes, %ch3
+  %ch4 = tfrt_gpu.mem.copy %gpu_buffer_0, %host_buffer, %stream, %ch3 : !tfrt_gpu.buffer, !ht.host_buffer
+
+  %ch5 = tfrt_dht.set_tensor_with_constant_values.f32 %host_tensor, %ch4 [2.0 : f32, 3.0 : f32, 4.0 : f32, 5.0 : f32]
+  %gpu_buffer_1 = tfrt_gpu.mem.allocate %allocator, %stream, %buffer_size_bytes, %ch5
+  %ch6 = tfrt_gpu.mem.copy %gpu_buffer_1, %host_buffer, %stream, %ch5 : !tfrt_gpu.buffer, !ht.host_buffer
+
+  %stride = tfrt.constant.i32 1
+  %alpha = tfrt.constant.f32 1.0
+  %ch7 = tfrt_gpu.blas.axpy %blas, %stream, %buffer_length, %alpha, rocblas_datatype_f32_r, 
+         %gpu_buffer_0, rocblas_datatype_f32_r, %stride, %gpu_buffer_1, rocblas_datatype_f32_r, %stride,
+         rocblas_datatype_f32_r, %ch6
+
+  %ch8 = tfrt_gpu.mem.copy %host_buffer, %gpu_buffer_1, %stream, %ch7 : !ht.host_buffer, !tfrt_gpu.buffer
+  %ch9 = tfrt_gpu.stream.synchronize %stream, %ch8
+  // CHECK: DenseHostTensor dtype = f32, shape = [2, 2]
+  // CHECK-SAME: values = [3.000000e+00, 5.000000e+00, 7.000000e+00, 9.000000e+00]
+  %ch10 = tfrt_dht.print_tensor %host_tensor, %ch9
+
+  tfrt.return
+}
+
+// CHECK-LABEL: --- Running 'blas_gemm'
+func @blas_gemm() {
+  %ch1 = tfrt.new.chain
+  %ordinal = tfrt.constant.i32 0
+  %device = tfrt_gpu.device.get ROCm, %ordinal
+  %context = tfrt_gpu.context.create %device
+  %allocator = tfrt_gpu.allocator.create %context
+  %stream = tfrt_gpu.stream.create %context
+  %blas = tfrt_gpu.blas.create %context
+
+  %buffer_length = tfrt.constant.i32 4 // [2, 2] = 4 floats
+  %buffer_size_bytes = tfrt.constant.i64 16 // [2, 2] * 4 bytes floats = 16 bytes
+
+  %host_tensor = tfrt_dht.create_uninitialized_tensor.f32.2 [2 : i64, 2 : i64]
+  %host_buffer, %ch2 = tfrt_dht.get_buffer %host_tensor, %ch1
+
+  %ch3 = tfrt_dht.set_tensor_with_constant_values.f32 %host_tensor, %ch2 [1.0 : f32, 2.0 : f32, 3.0 : f32, 4.0 : f32]
+  %gpu_buffer_0 = tfrt_gpu.mem.allocate %allocator, %stream, %buffer_size_bytes, %ch3
+  %ch4 = tfrt_gpu.mem.copy %gpu_buffer_0, %host_buffer, %stream, %ch3 : !tfrt_gpu.buffer, !ht.host_buffer
+
+  %ch5 = tfrt_dht.set_tensor_with_constant_values.f32 %host_tensor, %ch4 [2.0 : f32, 3.0 : f32, 4.0 : f32, 5.0 : f32]
+  %gpu_buffer_1 = tfrt_gpu.mem.allocate %allocator, %stream, %buffer_size_bytes, %ch5
+  %ch6 = tfrt_gpu.mem.copy %gpu_buffer_1, %host_buffer, %stream, %ch5 : !tfrt_gpu.buffer, !ht.host_buffer
+
+  %ch7 = tfrt_dht.set_tensor_with_constant_values.f32 %host_tensor, %ch6 [0.0 : f32, 0.0 : f32, 0.0 : f32, 0.0 : f32]
+  %gpu_buffer_2 = tfrt_gpu.mem.allocate %allocator, %stream, %buffer_size_bytes, %ch7
+  %ch8 = tfrt_gpu.mem.copy %gpu_buffer_2, %host_buffer, %stream, %ch7 : !tfrt_gpu.buffer, !ht.host_buffer
+
+  %dim = tfrt.constant.i32 2
+  %alpha = tfrt.constant.f32 1.0
+  %beta = tfrt.constant.f32 1.0
+  %algo = tfrt_gpu.blas.gemm.algo rocblas_gemm_algo_standard
+  %ch9 = tfrt_gpu.blas.gemm %blas, %stream,
+    rocblas_operation_none, rocblas_operation_none, %dim, %dim, %dim,
+    %alpha, %gpu_buffer_0, rocblas_datatype_f32_r, %dim,
+    %gpu_buffer_1, rocblas_datatype_f32_r, %dim, %beta,
+    %gpu_buffer_2, rocblas_datatype_f32_r, %dim,
+    rocblas_datatype_f32_r, %algo, %ch8
+
+  %ch10 = tfrt_gpu.mem.copy %host_buffer, %gpu_buffer_2, %stream, %ch9 : !ht.host_buffer, !tfrt_gpu.buffer
+  %ch11 = tfrt_gpu.stream.synchronize %stream, %ch10
+  // CHECK: DenseHostTensor dtype = f32, shape = [2, 2]
+  // CHECK-SAME: values = [1.100000e+01, 1.600000e+01, 1.900000e+01, 2.800000e+01]
+  %ch12 = tfrt_dht.print_tensor %host_tensor, %ch11
+
+  tfrt.return
+}
+
+// CHECK-LABEL: --- Running 'blas_gemm_batched'
+func @blas_gemm_batched() {
+  %ch1 = tfrt.new.chain
+  %ordinal = tfrt.constant.i32 0
+  %device = tfrt_gpu.device.get ROCm, %ordinal
+  %context = tfrt_gpu.context.create %device
+  %allocator = tfrt_gpu.allocator.create %context
+  %stream = tfrt_gpu.stream.create %context
+  %blas = tfrt_gpu.blas.create %context
+
+  %buffer_length = tfrt.constant.i32 4 // [2, 2] = 4 floats
+  %buffer_size_bytes = tfrt.constant.i64 16 // [2, 2] * 4 bytes floats = 16 bytes
+
+  %host_tensor = tfrt_dht.create_uninitialized_tensor.f32.2 [2 : i64, 2 : i64]
+  %host_buffer, %ch2 = tfrt_dht.get_buffer %host_tensor, %ch1
+
+  %ch3 = tfrt_dht.set_tensor_with_constant_values.f32 %host_tensor, %ch2 [1.0 : f32, 2.0 : f32, 3.0 : f32, 4.0 : f32]
+  %gpu_buffer_0 = tfrt_gpu.mem.allocate %allocator, %stream, %buffer_size_bytes, %ch3
+  %ch4 = tfrt_gpu.mem.copy %gpu_buffer_0, %host_buffer, %stream, %ch3 : !tfrt_gpu.buffer, !ht.host_buffer
+
+  %ch5 = tfrt_dht.set_tensor_with_constant_values.f32 %host_tensor, %ch4 [2.0 : f32, 3.0 : f32, 4.0 : f32, 5.0 : f32]
+  %gpu_buffer_1 = tfrt_gpu.mem.allocate %allocator, %stream, %buffer_size_bytes, %ch5
+  %ch6 = tfrt_gpu.mem.copy %gpu_buffer_1, %host_buffer, %stream, %ch5 : !tfrt_gpu.buffer, !ht.host_buffer
+
+  %ch7 = tfrt_dht.set_tensor_with_constant_values.f32 %host_tensor, %ch6 [0.0 : f32, 0.0 : f32, 0.0 : f32, 0.0 : f32]
+  %gpu_buffer_2 = tfrt_gpu.mem.allocate %allocator, %stream, %buffer_size_bytes, %ch7
+  %ch8 = tfrt_gpu.mem.copy %gpu_buffer_2, %host_buffer, %stream, %ch7 : !tfrt_gpu.buffer, !ht.host_buffer
+
+  %dim = tfrt.constant.i32 2
+  %type = tfrt.constant.i32 0
+  %algo = tfrt_gpu.blas.gemm.algo rocblas_gemm_algo_standard
+  %alpha = tfrt.constant.f32 1.0
+  %beta = tfrt.constant.f32 1.0
+  %batch_count = tfrt.constant.i32 1
+  %stride = tfrt.constant.i64 1
+  %ch9 = tfrt_gpu.blas.gemm.batch %blas, %stream,
+    rocblas_operation_none, rocblas_operation_none, %dim, %dim, %dim,
+    %alpha, %gpu_buffer_0, rocblas_datatype_f32_r, %dim, %stride,
+    %gpu_buffer_1, rocblas_datatype_f32_r, %dim, %stride, %beta,
+    %gpu_buffer_2, rocblas_datatype_f32_r, %dim, %stride, %batch_count,
+    rocblas_datatype_f32_r, %algo, %ch8
+
+  %ch10 = tfrt_gpu.mem.copy %host_buffer, %gpu_buffer_2, %stream, %ch9 : !ht.host_buffer, !tfrt_gpu.buffer
+  %ch11 = tfrt_gpu.stream.synchronize %stream, %ch10
+  // CHECK: DenseHostTensor dtype = f32, shape = [2, 2]
+  // CHECK-SAME: values = [1.100000e+01, 1.600000e+01, 1.900000e+01, 2.800000e+01]
+  %ch12 = tfrt_dht.print_tensor %host_tensor, %ch11
+
+  tfrt.return
+}
+
+// CHECK-LABEL: --- Running 'blas_trsm_batched'
+func @blas_trsm_batched() {
+  %ch0 = tfrt.new.chain
+  %ordinal = tfrt.constant.i32 0
+  %device = tfrt_gpu.device.get ROCm, %ordinal
+  %context = tfrt_gpu.context.create %device
+  %allocator = tfrt_gpu.allocator.create %context
+  %stream = tfrt_gpu.stream.create %context
+  %blas = tfrt_gpu.blas.create %context
+
+  %buffer_size_bytes = tfrt.constant.i64 16 // [2, 2] * 4 bytes floats = 16 bytes
+
+  %host_tensor = tfrt_dht.create_uninitialized_tensor.f32.2 [2 : i64, 2 : i64]
+  %host_buffer, %ch1 = tfrt_dht.get_buffer %host_tensor, %ch0
+
+  %ch2 = tfrt_dht.set_tensor_with_constant_values.f32 %host_tensor, %ch1 [1.0 : f32, 2.0 : f32, 0.0 : f32, 1.0 : f32]
+  %gpu_buffer_0 = tfrt_gpu.mem.allocate %allocator, %stream, %buffer_size_bytes, %ch2
+  %ch3 = tfrt_gpu.mem.copy %gpu_buffer_0, %host_buffer, %stream, %ch2 : !tfrt_gpu.buffer, !ht.host_buffer
+
+  %ch4 = tfrt_dht.set_tensor_with_constant_values.f32 %host_tensor, %ch3 [1.0 : f32, 4.0 : f32, 0.0 : f32, 0.0 : f32]
+  %gpu_buffer_1 = tfrt_gpu.mem.allocate %allocator, %stream, %buffer_size_bytes, %ch4
+  %ch5 = tfrt_gpu.mem.copy %gpu_buffer_1, %host_buffer, %stream, %ch4 : !tfrt_gpu.buffer, !ht.host_buffer
+
+  %dim = tfrt.constant.i32 2
+  %alpha = tfrt.constant.f32 1.0
+  %batch_count = tfrt.constant.i32 1
+  %ch6 = tfrt_gpu.blas.trsm.batch %blas, %stream, rocblas_side_left,
+    rocblas_fill_lower, rocblas_operation_none, rocblas_diagonal_unit, %dim, %dim,
+    rocblas_datatype_f32_r, %alpha, %gpu_buffer_0, %dim, %gpu_buffer_1, %dim, %batch_count,
+    %ch5
+
+  %ch7 = tfrt_gpu.mem.copy %host_buffer, %gpu_buffer_1, %stream, %ch6 : !ht.host_buffer, !tfrt_gpu.buffer
+  %ch8 = tfrt_gpu.stream.synchronize %stream, %ch7
+  // CHECK: DenseHostTensor dtype = f32, shape = [2, 2]
+  // CHECK-SAME: values = [1.000000e+00, 2.000000e+00, 0.000000e+00, 0.000000e+00]
+  %ch9 = tfrt_dht.print_tensor %host_tensor, %ch8
+
+  tfrt.return
+}
diff --git a/backends/gpu/tools/stub_codegen/hip.json b/backends/gpu/tools/stub_codegen/hip.json
index ceb91551159..307fa302c38 100644
--- a/backends/gpu/tools/stub_codegen/hip.json
+++ b/backends/gpu/tools/stub_codegen/hip.json
@@ -1,9 +1,9 @@
 {
-   "header":"/opt/rocm-4.2.0/include/hip/hip_runtime.h",
+   "header":"/opt/rocm-4.5.2/include/hip/hip_runtime.h",
    "extra_args":[
       "-D__HIP_PLATFORM_AMD__",
       "-I.",
-      "-I/opt/rocm-4.2.0/include/",
+      "-I/opt/rocm-4.5.2/include/",
       "-Ithird_party/llvm/llvm-project/clang/lib/Headers",
       "-Ibazel-genfiles",
       "-ferror-limit=0"
@@ -13,6 +13,7 @@
       "hipDeviceAttribute_t",
       "hipJitOption",
       "hipLimit_t",
+      "hipDataType",
       "hipMemoryType",
       "hipMemcpyKind",
       "hipFunction_attribute",
diff --git a/backends/gpu/tools/stub_codegen/hipfft.json b/backends/gpu/tools/stub_codegen/hipfft.json
index d9d9179b31d..802b86c55c8 100644
--- a/backends/gpu/tools/stub_codegen/hipfft.json
+++ b/backends/gpu/tools/stub_codegen/hipfft.json
@@ -1,8 +1,8 @@
 {
-   "header":"/opt/rocm-4.2.0/hipfft/include/hipfft.h",
+   "header":"/opt/rocm-4.5.2/hipfft/include/hipfft.h",
    "extra_args":[
       "-I.",
-      "-I/opt/rocm-4.2.0/include/"
+      "-I/opt/rocm-4.5.2/include/"
    ],
    "enums":[
       "hipfftLibraryPropertyType",
diff --git a/backends/gpu/tools/stub_codegen/miopen.json b/backends/gpu/tools/stub_codegen/miopen.json
index 57c9983db12..25277eb6ed0 100644
--- a/backends/gpu/tools/stub_codegen/miopen.json
+++ b/backends/gpu/tools/stub_codegen/miopen.json
@@ -1,8 +1,8 @@
 {
-   "header":"/opt/rocm-4.2.0/include/miopen/miopen.h",
+   "header":"/opt/rocm-4.5.2/include/miopen/miopen.h",
    "extra_args":[
       "-I.",
-      "-I/opt/rocm-4.2.0/include/"
+      "-I/opt/rocm-4.5.2/include/"
    ],
    "enums":[
       "miopenStatus_t",
diff --git a/backends/gpu/tools/stub_codegen/rocblas.json b/backends/gpu/tools/stub_codegen/rocblas.json
index c071842474b..8d8c22966c8 100644
--- a/backends/gpu/tools/stub_codegen/rocblas.json
+++ b/backends/gpu/tools/stub_codegen/rocblas.json
@@ -1,8 +1,8 @@
 {
-   "header":"/opt/rocm-4.2.0/include/rocblas.h",
+   "header":"/opt/rocm-4.5.2/include/rocblas.h",
    "extra_args":[
       "-I.",
-      "-I/opt/rocm-4.2.0/include/"
+      "-I/opt/rocm-4.5.2/include/"
    ],
    "enums":[
       "rocblas_operation",
diff --git a/backends/gpu/tools/stub_codegen/rocsolver.json b/backends/gpu/tools/stub_codegen/rocsolver.json
index c72ba7a891c..053b8895b64 100644
--- a/backends/gpu/tools/stub_codegen/rocsolver.json
+++ b/backends/gpu/tools/stub_codegen/rocsolver.json
@@ -1,8 +1,8 @@
 {
-   "header":"/opt/rocm-4.2.0/include/rocsolver.h",
+   "header":"/opt/rocm-4.5.2/include/rocsolver.h",
    "extra_args":[
       "-I.",
-      "-I/opt/rocm-4.2.0/include/"
+      "-I/opt/rocm-4.5.2/include/"
    ],
    "functions":[
       "rocsolver_spotrf",

From 85633100c15b0091d42fb58d446542157f88264b Mon Sep 17 00:00:00 2001
From: weihanmines <weihan13@amd.com>
Date: Wed, 19 Jan 2022 16:08:43 +0000
Subject: [PATCH 2/3] inc files added

---
 third_party/hip/hip_stub.cc.inc     | 33 ++++++------
 third_party/hip/hip_stub.h.inc      | 34 +++++++++---
 third_party/hip/miopen_stub.h.inc   |  1 +
 third_party/hip/rocblas_stub.cc.inc | 84 ++++++++++++++---------------
 third_party/hip/rocblas_stub.h.inc  | 53 +++++++++---------
 5 files changed, 113 insertions(+), 92 deletions(-)

diff --git a/third_party/hip/hip_stub.cc.inc b/third_party/hip/hip_stub.cc.inc
index fb17e9af57e..1a0431d3ca0 100644
--- a/third_party/hip/hip_stub.cc.inc
+++ b/third_party/hip/hip_stub.cc.inc
@@ -12,16 +12,6 @@ hipError_t hipRuntimeGetVersion(int* runtimeVersion) {
       "hipRuntimeGetVersion", runtimeVersion);
 }
 
-hipError_t hipGetLastError(void) {
-  return DynamicCall<decltype(hipGetLastError), &hipGetLastError>(
-      "hipGetLastError");
-}
-
-hipError_t hipPeekAtLastError(void) {
-  return DynamicCall<decltype(hipPeekAtLastError), &hipPeekAtLastError>(
-      "hipPeekAtLastError");
-}
-
 hipError_t hipDeviceGet(hipDevice_t* device, int ordinal) {
   return DynamicCall<decltype(hipDeviceGet), &hipDeviceGet>("hipDeviceGet",
                                                             device, ordinal);
@@ -73,6 +63,16 @@ hipError_t hipDeviceGetLimit(size_t* pValue, enum hipLimit_t limit) {
       "hipDeviceGetLimit", pValue, limit);
 }
 
+hipError_t hipGetLastError(void) {
+  return DynamicCall<decltype(hipGetLastError), &hipGetLastError>(
+      "hipGetLastError");
+}
+
+hipError_t hipPeekAtLastError(void) {
+  return DynamicCall<decltype(hipPeekAtLastError), &hipPeekAtLastError>(
+      "hipPeekAtLastError");
+}
+
 hipError_t hipStreamCreateWithFlags(hipStream_t* stream, unsigned int flags) {
   return DynamicCall<decltype(hipStreamCreateWithFlags),
                      &hipStreamCreateWithFlags>("hipStreamCreateWithFlags",
@@ -209,6 +209,12 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes,
                                                       sizeBytes, kind);
 }
 
+hipError_t hipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes,
+                              hipModule_t hmod, const char* name) {
+  return DynamicCall<decltype(hipModuleGetGlobal), &hipModuleGetGlobal>(
+      "hipModuleGetGlobal", dptr, bytes, hmod, name);
+}
+
 hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes,
                           hipMemcpyKind kind, hipStream_t stream __dparm(0)) {
   return DynamicCall<decltype(hipMemcpyAsync), &hipMemcpyAsync>(
@@ -376,13 +382,6 @@ hipError_t hipModuleGetFunction(hipFunction_t* function, hipModule_t module,
       "hipModuleGetFunction", function, module, kname);
 }
 
-hipError_t hipModuleGetGlobal(void** ptr, size_t* bytes, hipModule_t module,
-                              const char* kname) {
-  return DynamicCall<decltype(hipModuleGetGlobal), &hipModuleGetGlobal>(
-      "hipModuleGetGlobal", ptr, bytes, module, kname);
-}
-
-
 hipError_t hipFuncGetAttributes(struct hipFuncAttributes* attr,
                                 const void* func) {
   return DynamicCall<decltype(hipFuncGetAttributes), &hipFuncGetAttributes>(
diff --git a/third_party/hip/hip_stub.h.inc b/third_party/hip/hip_stub.h.inc
index fc8d5fae4ff..c7dcedec42b 100644
--- a/third_party/hip/hip_stub.h.inc
+++ b/third_party/hip/hip_stub.h.inc
@@ -62,12 +62,22 @@ enum hipError_t {
   hipErrorPeerAccessAlreadyEnabled = 704,
   hipErrorPeerAccessNotEnabled = 705,
   hipErrorSetOnActiveProcess = 708,
+  hipErrorContextIsDestroyed = 709,
   hipErrorAssert = 710,
   hipErrorHostMemoryAlreadyRegistered = 712,
   hipErrorHostMemoryNotRegistered = 713,
   hipErrorLaunchFailure = 719,
   hipErrorCooperativeLaunchTooLarge = 720,
   hipErrorNotSupported = 801,
+  hipErrorStreamCaptureUnsupported = 900,
+  hipErrorStreamCaptureInvalidated = 901,
+  hipErrorStreamCaptureMerge = 902,
+  hipErrorStreamCaptureUnmatched = 903,
+  hipErrorStreamCaptureUnjoined = 904,
+  hipErrorStreamCaptureIsolation = 905,
+  hipErrorStreamCaptureImplicit = 906,
+  hipErrorCapturedEvent = 907,
+  hipErrorStreamCaptureWrongThread = 908,
   hipErrorUnknown = 999,
   hipErrorRuntimeMemory = 1052,
   hipErrorRuntimeOther = 1053,
@@ -153,6 +163,7 @@ enum hipFunction_attribute {
 };
 
 enum hipLimit_t {
+  hipLimitPrintfFifoSize = 0x01,
   hipLimitMallocHeapSize = 0x02,
 };
 
@@ -196,10 +207,6 @@ hipError_t hipDriverGetVersion(int* driverVersion);
 
 hipError_t hipRuntimeGetVersion(int* runtimeVersion);
 
-hipError_t hipGetLastError(void);
-
-hipError_t hipPeekAtLastError(void);
-
 hipError_t hipDeviceGet(hipDevice_t* device, int ordinal);
 
 hipError_t hipDeviceGetName(char* name, int len, hipDevice_t device);
@@ -221,6 +228,10 @@ hipError_t hipGetDeviceProperties(hipDeviceProp_t* prop, int deviceId);
 
 hipError_t hipDeviceGetLimit(size_t* pValue, enum hipLimit_t limit);
 
+hipError_t hipGetLastError(void);
+
+hipError_t hipPeekAtLastError(void);
+
 hipError_t hipStreamCreateWithFlags(hipStream_t* stream, unsigned int flags);
 
 hipError_t hipStreamCreateWithPriority(hipStream_t* stream, unsigned int flags,
@@ -278,6 +289,9 @@ hipError_t hipHostFree(void* ptr);
 hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes,
                      hipMemcpyKind kind);
 
+hipError_t hipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes,
+                              hipModule_t hmod, const char* name);
+
 hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes,
                           hipMemcpyKind kind, hipStream_t stream __dparm(0));
 
@@ -353,9 +367,6 @@ hipError_t hipModuleUnload(hipModule_t module);
 hipError_t hipModuleGetFunction(hipFunction_t* function, hipModule_t module,
                                 const char* kname);
 
-hipError_t hipModuleGetGlobal(void** ptr, size_t* bytes, hipModule_t module,
-                                const char* kname);
-
 hipError_t hipFuncGetAttributes(struct hipFuncAttributes* attr,
                                 const void* func);
 
@@ -385,3 +396,12 @@ hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
                                              const void* f,
                                              size_t dynSharedMemPerBlk,
                                              int blockSizeLimit);
+
+enum hipDataType {
+  HIP_R_16F = 2,
+  HIP_R_32F = 0,
+  HIP_R_64F = 1,
+  HIP_C_16F = 6,
+  HIP_C_32F = 4,
+  HIP_C_64F = 5,
+};
diff --git a/third_party/hip/miopen_stub.h.inc b/third_party/hip/miopen_stub.h.inc
index b3794d392aa..2ccf9e06b54 100644
--- a/third_party/hip/miopen_stub.h.inc
+++ b/third_party/hip/miopen_stub.h.inc
@@ -46,6 +46,7 @@ typedef enum {
   miopenInt8 = 3,
   miopenInt8x4 = 4,
   miopenBFloat16 = 5,
+  miopenDouble = 6,
 } miopenDataType_t;
 
 typedef enum {
diff --git a/third_party/hip/rocblas_stub.cc.inc b/third_party/hip/rocblas_stub.cc.inc
index 40c5d05975c..3c10e0c723d 100644
--- a/third_party/hip/rocblas_stub.cc.inc
+++ b/third_party/hip/rocblas_stub.cc.inc
@@ -34,48 +34,6 @@ ROCBLAS_EXPORT rocblas_status rocblas_get_pointer_mode(
                                                 handle, pointer_mode);
 }
 
-ROCBLAS_EXPORT rocblas_status rocblas_gemm_ex(
-    rocblas_handle handle, rocblas_operation transA, rocblas_operation transB,
-    rocblas_int m, rocblas_int n, rocblas_int k, const void* alpha,
-    const void* a, rocblas_datatype a_type, rocblas_int lda, const void* b,
-    rocblas_datatype b_type, rocblas_int ldb, const void* beta, const void* c,
-    rocblas_datatype c_type, rocblas_int ldc, void* d, rocblas_datatype d_type,
-    rocblas_int ldd, rocblas_datatype compute_type, rocblas_gemm_algo algo,
-    int32_t solution_index, uint32_t flags) {
-  return DynamicCall<decltype(rocblas_gemm_ex), &rocblas_gemm_ex>(
-      "rocblas_gemm_ex", handle, transA, transB, m, n, k, alpha, a, a_type, lda,
-      b, b_type, ldb, beta, c, c_type, ldc, d, d_type, ldd, compute_type, algo,
-      solution_index, flags);
-}
-
-ROCBLAS_EXPORT rocblas_status rocblas_gemm_strided_batched_ex(
-    rocblas_handle handle, rocblas_operation transA, rocblas_operation transB,
-    rocblas_int m, rocblas_int n, rocblas_int k, const void* alpha,
-    const void* a, rocblas_datatype a_type, rocblas_int lda,
-    rocblas_stride stride_a, const void* b, rocblas_datatype b_type,
-    rocblas_int ldb, rocblas_stride stride_b, const void* beta, const void* c,
-    rocblas_datatype c_type, rocblas_int ldc, rocblas_stride stride_c, void* d,
-    rocblas_datatype d_type, rocblas_int ldd, rocblas_stride stride_d,
-    rocblas_int batch_count, rocblas_datatype compute_type,
-    rocblas_gemm_algo algo, int32_t solution_index, uint32_t flags) {
-  return DynamicCall<decltype(rocblas_gemm_strided_batched_ex),
-                     &rocblas_gemm_strided_batched_ex>(
-      "rocblas_gemm_strided_batched_ex", handle, transA, transB, m, n, k, alpha,
-      a, a_type, lda, stride_a, b, b_type, ldb, stride_b, beta, c, c_type, ldc,
-      stride_c, d, d_type, ldd, stride_d, batch_count, compute_type, algo,
-      solution_index, flags);
-}
-
-ROCBLAS_EXPORT rocblas_status rocblas_axpy_ex(
-    rocblas_handle handle, rocblas_int n, const void* alpha,
-    rocblas_datatype alpha_type, const void* x, rocblas_datatype x_type,
-    rocblas_int incx, void* y, rocblas_datatype y_type, rocblas_int incy,
-    rocblas_datatype execution_type) {
-  return DynamicCall<decltype(rocblas_axpy_ex), &rocblas_axpy_ex>(
-      "rocblas_axpy_ex", handle, n, alpha, alpha_type, x, x_type, incx, y,
-      y_type, incy, execution_type);
-}
-
 ROCBLAS_EXPORT rocblas_status rocblas_strsm_batched(
     rocblas_handle handle, rocblas_side side, rocblas_fill uplo,
     rocblas_operation transA, rocblas_diagonal diag, rocblas_int m,
@@ -120,3 +78,45 @@ ROCBLAS_EXPORT rocblas_status rocblas_ztrsm_batched(
       "rocblas_ztrsm_batched", handle, side, uplo, transA, diag, m, n, alpha, A,
       lda, B, ldb, batch_count);
 }
+
+ROCBLAS_EXPORT rocblas_status rocblas_gemm_ex(
+    rocblas_handle handle, rocblas_operation transA, rocblas_operation transB,
+    rocblas_int m, rocblas_int n, rocblas_int k, const void* alpha,
+    const void* a, rocblas_datatype a_type, rocblas_int lda, const void* b,
+    rocblas_datatype b_type, rocblas_int ldb, const void* beta, const void* c,
+    rocblas_datatype c_type, rocblas_int ldc, void* d, rocblas_datatype d_type,
+    rocblas_int ldd, rocblas_datatype compute_type, rocblas_gemm_algo algo,
+    int32_t solution_index, uint32_t flags) {
+  return DynamicCall<decltype(rocblas_gemm_ex), &rocblas_gemm_ex>(
+      "rocblas_gemm_ex", handle, transA, transB, m, n, k, alpha, a, a_type, lda,
+      b, b_type, ldb, beta, c, c_type, ldc, d, d_type, ldd, compute_type, algo,
+      solution_index, flags);
+}
+
+ROCBLAS_EXPORT rocblas_status rocblas_gemm_strided_batched_ex(
+    rocblas_handle handle, rocblas_operation transA, rocblas_operation transB,
+    rocblas_int m, rocblas_int n, rocblas_int k, const void* alpha,
+    const void* a, rocblas_datatype a_type, rocblas_int lda,
+    rocblas_stride stride_a, const void* b, rocblas_datatype b_type,
+    rocblas_int ldb, rocblas_stride stride_b, const void* beta, const void* c,
+    rocblas_datatype c_type, rocblas_int ldc, rocblas_stride stride_c, void* d,
+    rocblas_datatype d_type, rocblas_int ldd, rocblas_stride stride_d,
+    rocblas_int batch_count, rocblas_datatype compute_type,
+    rocblas_gemm_algo algo, int32_t solution_index, uint32_t flags) {
+  return DynamicCall<decltype(rocblas_gemm_strided_batched_ex),
+                     &rocblas_gemm_strided_batched_ex>(
+      "rocblas_gemm_strided_batched_ex", handle, transA, transB, m, n, k, alpha,
+      a, a_type, lda, stride_a, b, b_type, ldb, stride_b, beta, c, c_type, ldc,
+      stride_c, d, d_type, ldd, stride_d, batch_count, compute_type, algo,
+      solution_index, flags);
+}
+
+ROCBLAS_EXPORT rocblas_status rocblas_axpy_ex(
+    rocblas_handle handle, rocblas_int n, const void* alpha,
+    rocblas_datatype alpha_type, const void* x, rocblas_datatype x_type,
+    rocblas_int incx, void* y, rocblas_datatype y_type, rocblas_int incy,
+    rocblas_datatype execution_type) {
+  return DynamicCall<decltype(rocblas_axpy_ex), &rocblas_axpy_ex>(
+      "rocblas_axpy_ex", handle, n, alpha, alpha_type, x, x_type, incx, y,
+      y_type, incy, execution_type);
+}
diff --git a/third_party/hip/rocblas_stub.h.inc b/third_party/hip/rocblas_stub.h.inc
index 7f8318362b6..eddf6ec949c 100644
--- a/third_party/hip/rocblas_stub.h.inc
+++ b/third_party/hip/rocblas_stub.h.inc
@@ -71,6 +71,7 @@ typedef enum rocblas_gemm_algo_ {
 typedef enum rocblas_gemm_flags_ {
   rocblas_gemm_flags_none = 0x0,
   rocblas_gemm_flags_pack_int8x4 = 0x1,
+  rocblas_gemm_flags_use_cu_efficiency = 0x2,
 } rocblas_gemm_flags;
 
 ROCBLAS_EXPORT rocblas_status rocblas_create_handle(rocblas_handle* handle);
@@ -89,32 +90,6 @@ ROCBLAS_EXPORT rocblas_status rocblas_set_pointer_mode(
 ROCBLAS_EXPORT rocblas_status rocblas_get_pointer_mode(
     rocblas_handle handle, rocblas_pointer_mode* pointer_mode);
 
-ROCBLAS_EXPORT rocblas_status rocblas_gemm_ex(
-    rocblas_handle handle, rocblas_operation transA, rocblas_operation transB,
-    rocblas_int m, rocblas_int n, rocblas_int k, const void* alpha,
-    const void* a, rocblas_datatype a_type, rocblas_int lda, const void* b,
-    rocblas_datatype b_type, rocblas_int ldb, const void* beta, const void* c,
-    rocblas_datatype c_type, rocblas_int ldc, void* d, rocblas_datatype d_type,
-    rocblas_int ldd, rocblas_datatype compute_type, rocblas_gemm_algo algo,
-    int32_t solution_index, uint32_t flags);
-
-ROCBLAS_EXPORT rocblas_status rocblas_gemm_strided_batched_ex(
-    rocblas_handle handle, rocblas_operation transA, rocblas_operation transB,
-    rocblas_int m, rocblas_int n, rocblas_int k, const void* alpha,
-    const void* a, rocblas_datatype a_type, rocblas_int lda,
-    rocblas_stride stride_a, const void* b, rocblas_datatype b_type,
-    rocblas_int ldb, rocblas_stride stride_b, const void* beta, const void* c,
-    rocblas_datatype c_type, rocblas_int ldc, rocblas_stride stride_c, void* d,
-    rocblas_datatype d_type, rocblas_int ldd, rocblas_stride stride_d,
-    rocblas_int batch_count, rocblas_datatype compute_type,
-    rocblas_gemm_algo algo, int32_t solution_index, uint32_t flags);
-
-ROCBLAS_EXPORT rocblas_status rocblas_axpy_ex(
-    rocblas_handle handle, rocblas_int n, const void* alpha,
-    rocblas_datatype alpha_type, const void* x, rocblas_datatype x_type,
-    rocblas_int incx, void* y, rocblas_datatype y_type, rocblas_int incy,
-    rocblas_datatype execution_type);
-
 ROCBLAS_EXPORT rocblas_status rocblas_strsm_batched(
     rocblas_handle handle, rocblas_side side, rocblas_fill uplo,
     rocblas_operation transA, rocblas_diagonal diag, rocblas_int m,
@@ -142,3 +117,29 @@ ROCBLAS_EXPORT rocblas_status rocblas_ztrsm_batched(
     const rocblas_double_complex* const A[], rocblas_int lda,
     rocblas_double_complex* const B[], rocblas_int ldb,
     rocblas_int batch_count);
+
+ROCBLAS_EXPORT rocblas_status rocblas_gemm_ex(
+    rocblas_handle handle, rocblas_operation transA, rocblas_operation transB,
+    rocblas_int m, rocblas_int n, rocblas_int k, const void* alpha,
+    const void* a, rocblas_datatype a_type, rocblas_int lda, const void* b,
+    rocblas_datatype b_type, rocblas_int ldb, const void* beta, const void* c,
+    rocblas_datatype c_type, rocblas_int ldc, void* d, rocblas_datatype d_type,
+    rocblas_int ldd, rocblas_datatype compute_type, rocblas_gemm_algo algo,
+    int32_t solution_index, uint32_t flags);
+
+ROCBLAS_EXPORT rocblas_status rocblas_gemm_strided_batched_ex(
+    rocblas_handle handle, rocblas_operation transA, rocblas_operation transB,
+    rocblas_int m, rocblas_int n, rocblas_int k, const void* alpha,
+    const void* a, rocblas_datatype a_type, rocblas_int lda,
+    rocblas_stride stride_a, const void* b, rocblas_datatype b_type,
+    rocblas_int ldb, rocblas_stride stride_b, const void* beta, const void* c,
+    rocblas_datatype c_type, rocblas_int ldc, rocblas_stride stride_c, void* d,
+    rocblas_datatype d_type, rocblas_int ldd, rocblas_stride stride_d,
+    rocblas_int batch_count, rocblas_datatype compute_type,
+    rocblas_gemm_algo algo, int32_t solution_index, uint32_t flags);
+
+ROCBLAS_EXPORT rocblas_status rocblas_axpy_ex(
+    rocblas_handle handle, rocblas_int n, const void* alpha,
+    rocblas_datatype alpha_type, const void* x, rocblas_datatype x_type,
+    rocblas_int incx, void* y, rocblas_datatype y_type, rocblas_int incy,
+    rocblas_datatype execution_type);

From db2ced44941cd056047bd5f706a05ce744749d4a Mon Sep 17 00:00:00 2001
From: weihanmines <weihan13@amd.com>
Date: Mon, 7 Mar 2022 23:39:37 +0000
Subject: [PATCH 3/3] add ptx alternatives in ROCm

---
 .../include/tfrt/gpu/wrapper/hip_forwards.h   |  2 +
 .../gpu/include/tfrt/gpu/wrapper/hip_stub.h   | 23 +++++
 .../include/tfrt/gpu/wrapper/hip_wrapper.h    |  3 +
 backends/gpu/lib/kernels/driver_kernels.cc    |  1 -
 backends/gpu/lib/wrapper/driver_wrapper.cc    |  2 +-
 backends/gpu/lib/wrapper/hip_stub.cc          | 89 ++++++++++++++++++
 backends/gpu/lib/wrapper/hip_wrapper.cc       | 45 +++++++++
 backends/gpu/mlir_tests/rocm/module.mlir      | 94 +++++++++++++++++++
 backends/gpu/tools/stub_codegen/generate.sh   |  9 +-
 backends/gpu/tools/stub_codegen/hipfft.json   |  2 +-
 backends/gpu/tools/stub_codegen/hiprtc.json   | 16 ++++
 backends/gpu/tools/stub_codegen/miopen.json   |  2 +-
 backends/gpu/tools/stub_codegen/rocblas.json  |  2 +-
 .../gpu/tools/stub_codegen/rocsolver.json     |  2 +-
 third_party/hip/hip_stub.h.inc                | 14 +++
 third_party/hip/hipfft_stub.cc.inc            | 22 +++--
 third_party/hip/hipfft_stub.h.inc             | 11 ++-
 17 files changed, 317 insertions(+), 22 deletions(-)
 create mode 100644 backends/gpu/mlir_tests/rocm/module.mlir
 create mode 100644 backends/gpu/tools/stub_codegen/hiprtc.json

diff --git a/backends/gpu/include/tfrt/gpu/wrapper/hip_forwards.h b/backends/gpu/include/tfrt/gpu/wrapper/hip_forwards.h
index dc3e71ff986..b400636a090 100644
--- a/backends/gpu/include/tfrt/gpu/wrapper/hip_forwards.h
+++ b/backends/gpu/include/tfrt/gpu/wrapper/hip_forwards.h
@@ -59,6 +59,8 @@ using ncclComm_t = struct ncclComm *;
 
 // Forward declaration of hipFFT types.
 using hipfftHandle = struct hipfftHandle_t *;
+// Forward declaration of  hiprtcProgram
+using hiprtcProgram = struct _hiprtcProgram *;
 // Enums for corresponding #defines in the hipFFT headers.
 enum hipfftDirection_t : int {
   HIPFFT_FORWARD = -1,
diff --git a/backends/gpu/include/tfrt/gpu/wrapper/hip_stub.h b/backends/gpu/include/tfrt/gpu/wrapper/hip_stub.h
index 3908eed5205..646aa532827 100644
--- a/backends/gpu/include/tfrt/gpu/wrapper/hip_stub.h
+++ b/backends/gpu/include/tfrt/gpu/wrapper/hip_stub.h
@@ -52,6 +52,29 @@ extern "C" {
 
 const char* hipGetErrorName(hipError_t hip_error);
 const char* hipGetErrorString(hipError_t hip_error);
+const char *hiprtcGetErrorString(hiprtcResult result);
+hiprtcResult hiprtcVersion(int* major, int* minor);
+hiprtcResult hiprtcAddNameExpression(hiprtcProgram prog, const char* name_expression);
+hiprtcResult hiprtcCompileProgram(
+                                   hiprtcProgram prog,
+                                   int numOptions,
+                                   const char** options);
+hiprtcResult hiprtcCreateProgram(
+                                  hiprtcProgram* prog,
+                                  const char* src,
+                                  const char* name,
+                                  int numberHeaders,
+                                  char** headers,
+                                  const char** includeNames);
+hiprtcResult hiprtcDestroyProgram(hiprtcProgram* prog);
+hiprtcResult hiprtcGetLoweredName(
+                                  hiprtcProgram prog,
+                                  const char* name_expression,
+                                  const char** lowered_name);
+hiprtcResult hiprtcGetProgramLog(hiprtcProgram prog, char* log);
+hiprtcResult hiprtcGetProgramLogSize(hiprtcProgram prog, size_t* logSizeRet);
+hiprtcResult hiprtcGetCode(hiprtcProgram prog, char* code);
+hiprtcResult hiprtcGetCodeSize(hiprtcProgram prog, size_t* codeSizeRet);
 
 // Enums for corresponding #defines in the HIP headers.
 enum hipDeviceFlags_t {
diff --git a/backends/gpu/include/tfrt/gpu/wrapper/hip_wrapper.h b/backends/gpu/include/tfrt/gpu/wrapper/hip_wrapper.h
index 98a3f0bc0fd..92feac7d949 100644
--- a/backends/gpu/include/tfrt/gpu/wrapper/hip_wrapper.h
+++ b/backends/gpu/include/tfrt/gpu/wrapper/hip_wrapper.h
@@ -29,6 +29,7 @@ namespace gpu {
 namespace wrapper {
 
 raw_ostream& Print(raw_ostream& os, hipError_t error);
+raw_ostream& Print(raw_ostream& os, hiprtcResult result);
 
 namespace internal {
 template <>
@@ -162,6 +163,8 @@ llvm::Error HipMemsetD32Async(CurrentContext current, Pointer<void> dst,
 
 llvm::Expected<OwningModule> HipModuleLoadData(CurrentContext current,
                                                const void* image);
+llvm::Expected<OwningModule> HipRTCModuleLoadData(CurrentContext current,
+                                                  const void* image);
 llvm::Expected<OwningModule> HipModuleLoadDataEx(
     CurrentContext current, const void* image,
     llvm::ArrayRef<hipJitOption> options, llvm::ArrayRef<void*> option_values);
diff --git a/backends/gpu/lib/kernels/driver_kernels.cc b/backends/gpu/lib/kernels/driver_kernels.cc
index 6feb245ceff..8a5d40c204e 100644
--- a/backends/gpu/lib/kernels/driver_kernels.cc
+++ b/backends/gpu/lib/kernels/driver_kernels.cc
@@ -320,7 +320,6 @@ static Expected<GpuModule> GpuModuleLoad(Argument<GpuContext> context,
                             MakeStringError("GPU JIT error log: ", error_log));
   }
 #endif
-
   return GpuModule(context.ValueRef(), std::move(*module));
 }
 
diff --git a/backends/gpu/lib/wrapper/driver_wrapper.cc b/backends/gpu/lib/wrapper/driver_wrapper.cc
index ef4574980da..7e11316d95a 100644
--- a/backends/gpu/lib/wrapper/driver_wrapper.cc
+++ b/backends/gpu/lib/wrapper/driver_wrapper.cc
@@ -852,7 +852,7 @@ llvm::Expected<OwningModule> ModuleLoadData(CurrentContext current,
     case Platform::CUDA:
       return CuModuleLoadData(current, image);
     case Platform::ROCm:
-      return HipModuleLoadData(current, image);
+      return HipRTCModuleLoadData(current, image);
     default:
       return InvalidPlatform(platform);
   }
diff --git a/backends/gpu/lib/wrapper/hip_stub.cc b/backends/gpu/lib/wrapper/hip_stub.cc
index 68a650d3bdf..539738d9339 100644
--- a/backends/gpu/lib/wrapper/hip_stub.cc
+++ b/backends/gpu/lib/wrapper/hip_stub.cc
@@ -61,3 +61,92 @@ const char *hipGetErrorString(hipError_t hip_error) {
   if (!func_ptr) return "FAILED_TO_LOAD_FUNCTION_SYMBOL";
   return func_ptr(hip_error);
 }
+
+const char *hiprtcGetErrorString(hiprtcResult result) {
+  static auto func_ptr =
+      GetFunctionPointer("hiprtcGetErrorString", hiprtcGetErrorString);
+  if (!func_ptr) return "FAILED_TO_LOAD_FUNCTION_SYMBOL";
+  return func_ptr(result);
+}
+
+hiprtcResult hiprtcVersion(int* major, int* minor){
+  static auto func_ptr =
+      GetFunctionPointer("hiprtcVersion", hiprtcVersion);
+  if (!func_ptr) return HIPRTC_ERROR_INTERNAL_ERROR;
+  return func_ptr(major, minor);
+}
+
+hiprtcResult hiprtcAddNameExpression(hiprtcProgram prog, const char* name_expression){
+  static auto func_ptr =
+      GetFunctionPointer("hiprtcAddNameExpression", hiprtcAddNameExpression);
+  if (!func_ptr) return HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID;
+  return func_ptr(prog, name_expression);
+}
+
+hiprtcResult hiprtcCompileProgram(
+                                   hiprtcProgram prog,
+                                   int numOptions,
+                                   const char** options){
+  static auto func_ptr =
+      GetFunctionPointer("hiprtcCompileProgram", hiprtcCompileProgram);
+  if (!func_ptr) return HIPRTC_ERROR_INTERNAL_ERROR;
+  return func_ptr(prog, numOptions, options);
+}
+
+hiprtcResult hiprtcCreateProgram(
+                                  hiprtcProgram* prog,
+                                  const char* src,
+                                  const char* name,
+                                  int numberHeaders,
+                                  char** headers,
+                                  const char** includeNames){
+  static auto func_ptr =
+      GetFunctionPointer("hiprtcCreateProgram", hiprtcCreateProgram);
+  if (!func_ptr) return HIPRTC_ERROR_PROGRAM_CREATION_FAILURE;
+  return func_ptr(prog, src, name, numberHeaders, headers, includeNames);
+}
+
+hiprtcResult hiprtcDestroyProgram(hiprtcProgram* prog){
+  static auto func_ptr =
+      GetFunctionPointer("hiprtcDestroyProgram", hiprtcDestroyProgram);
+  if (!func_ptr) return HIPRTC_ERROR_INTERNAL_ERROR;
+  return func_ptr(prog);
+}
+
+hiprtcResult hiprtcGetLoweredName(
+                                  hiprtcProgram prog,
+                                  const char* name_expression,
+                                  const char** lowered_name){
+  static auto func_ptr =
+      GetFunctionPointer("hiprtcGetLoweredName", hiprtcGetLoweredName);
+  if (!func_ptr) return HIPRTC_ERROR_INTERNAL_ERROR;
+  return func_ptr(prog, name_expression, lowered_name);
+}
+
+hiprtcResult hiprtcGetProgramLog(hiprtcProgram prog, char* log){
+  static auto func_ptr =
+      GetFunctionPointer("hiprtcGetProgramLog", hiprtcGetProgramLog);
+  if (!func_ptr) return HIPRTC_ERROR_INTERNAL_ERROR;
+  return func_ptr(prog, log);
+}
+
+hiprtcResult hiprtcGetProgramLogSize(hiprtcProgram prog, size_t* logSizeRet){
+  static auto func_ptr =
+      GetFunctionPointer("hiprtcGetProgramLogSize", hiprtcGetProgramLogSize);
+  if (!func_ptr) return HIPRTC_ERROR_INTERNAL_ERROR;
+  return func_ptr(prog, logSizeRet);
+}
+
+hiprtcResult hiprtcGetCode(hiprtcProgram prog, char* code){
+  static auto func_ptr =
+      GetFunctionPointer("hiprtcGetCode", hiprtcGetCode);
+  if (!func_ptr) return HIPRTC_ERROR_INTERNAL_ERROR;
+  return func_ptr(prog, code);
+}
+
+hiprtcResult hiprtcGetCodeSize(hiprtcProgram prog, size_t* codeSizeRet){
+  static auto func_ptr =
+      GetFunctionPointer("hiprtcGetCodeSize", hiprtcGetCodeSize);
+  if (!func_ptr) return HIPRTC_ERROR_INTERNAL_ERROR;
+  return func_ptr(prog, codeSizeRet);
+}
diff --git a/backends/gpu/lib/wrapper/hip_wrapper.cc b/backends/gpu/lib/wrapper/hip_wrapper.cc
index 052c06f1f75..cf37c4ef03e 100644
--- a/backends/gpu/lib/wrapper/hip_wrapper.cc
+++ b/backends/gpu/lib/wrapper/hip_wrapper.cc
@@ -37,6 +37,12 @@ llvm::raw_ostream& Print(llvm::raw_ostream& os, hipError_t error) {
   return os;
 }
 
+llvm::raw_ostream& Print(llvm::raw_ostream& os, hiprtcResult result) {
+  const char* msg = hiprtcGetErrorString(result);
+  if (msg != nullptr) os << "hiprtc Error: (" << msg << ")";
+  return os;
+}
+
 // Convert wrapper types to HIP types.
 static hipDevice_t ToRocm(Device device) { return device.id(Platform::ROCm); }
 
@@ -540,6 +546,45 @@ llvm::Expected<OwningModule> HipModuleLoadData(CurrentContext current,
   return OwningModule(module);
 }
 
+llvm::Expected<OwningModule> HipRTCModuleLoadData(CurrentContext current,
+                                                  const void* image) {
+  CheckHipContext(current);
+  hiprtcProgram prog;
+  //auto img = reinterpret_cast<const char*>(const_cast<void*>(image));
+  auto kernel = static_cast<const char*>(image);
+  std::string kname(kernel);
+  kname += ".cu";
+  RETURN_IF_ERROR(hiprtcCreateProgram(&prog,
+                                      kernel,
+                                      kname.c_str(),
+                                      0,
+                                      nullptr,
+                                      nullptr
+                                     ));
+  hiprtcResult compileResult = hiprtcCompileProgram(prog, 0, nullptr);
+  if (compileResult != HIPRTC_SUCCESS) {
+    size_t logSize;
+    hiprtcGetProgramLogSize(prog, &logSize);
+    if (logSize) {
+      std::string log(logSize, '\0');
+      hiprtcGetProgramLog(prog, &log[0]);
+      MakeStringError(log.c_str());
+    }
+  }
+
+  size_t code_size;
+  RETURN_IF_ERROR(hiprtcGetCodeSize(prog, &code_size));
+  std::vector<char> code(code_size);
+  RETURN_IF_ERROR(hiprtcGetCode(prog, code.data()));
+  RETURN_IF_ERROR(hiprtcDestroyProgram(&prog));
+
+  hipModule_t module;
+  RETURN_IF_ERROR(hipModuleLoadData(&module, code.data()));
+
+  NotifyResourceCreated(ResourceType::kModule, module);
+  return OwningModule(module);
+}
+
 llvm::Expected<OwningModule> HipModuleLoadDataEx(
     CurrentContext current, const void* image,
     llvm::ArrayRef<hipJitOption> options, llvm::ArrayRef<void*> option_values) {
diff --git a/backends/gpu/mlir_tests/rocm/module.mlir b/backends/gpu/mlir_tests/rocm/module.mlir
new file mode 100644
index 00000000000..df2d3be1332
--- /dev/null
+++ b/backends/gpu/mlir_tests/rocm/module.mlir
@@ -0,0 +1,94 @@
+// Copyright 2020 The TensorFlow Runtime Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// RUN: bef_executor_lite %s.bef | FileCheck %s
+
+// CHECK-LABEL: --- Running 'function_test'
+func @function_test() {
+  %ordinal = tfrt.constant.i32 0
+  %device = tfrt_gpu.device.get ROCm, %ordinal
+  %context = tfrt_gpu.context.create %device
+
+  // PTX for empty kernel.
+  // Typically module loading should be done at initialization time.
+  %module = tfrt_gpu.module.load %context {
+    data = "extern \"C\" __global__ void Kernel() { return; }\00"
+  }
+
+  %func = tfrt_gpu.module.get_function %module { name = "Kernel" }
+
+  tfrt.return
+}
+
+func @global_test() {
+  %ordinal = tfrt.constant.i32 0
+  %device = tfrt_gpu.device.get ROCm, %ordinal
+  %context = tfrt_gpu.context.create %device
+
+  // PTX for a module with a global symbol.
+  %module = tfrt_gpu.module.load %context {
+    data = "__device__ unsigned int Global[128];\00"
+  }
+
+  %global = tfrt_gpu.module.get_global %module { name = "Global" }
+
+  tfrt.return
+}
+
+// CHECK-LABEL: --- Running 'module_bad_data_test'
+func @module_bad_data_test() {
+  %ch2 = tfrt.new.chain
+  %ordinal = tfrt.constant.i32 0
+  %device = tfrt_gpu.device.get ROCm, %ordinal
+  %context = tfrt_gpu.context.create %device
+
+  // expected-error @+1 {{hipErrorInvalidValue}}
+  %func = tfrt_gpu.module.load %context {
+    data = "invalid image\00"
+  }
+
+  tfrt.return
+}
+
+// CHECK-LABEL: --- Running 'function_bad_name_test'
+func @function_bad_name_test() {
+  %ch2 = tfrt.new.chain
+  %ordinal = tfrt.constant.i32 0
+  %device = tfrt_gpu.device.get ROCm, %ordinal
+  %context = tfrt_gpu.context.create %device
+
+  %module = tfrt_gpu.module.load %context {
+    data = "extern \"C\" __global__ void Kernel() { return; }\00"
+  }
+
+  // expected-error @+1 {{hipErrorNotFound}}
+  %func = tfrt_gpu.module.get_function %module { name = "Foo\00" }
+
+  tfrt.return
+}
+
+// CHECK-LABEL: --- Running 'module_not_null_terminated_test'
+func @module_not_null_terminated_test() {
+  %ch2 = tfrt.new.chain
+  %ordinal = tfrt.constant.i32 0
+  %device = tfrt_gpu.device.get ROCm, %ordinal
+  %context = tfrt_gpu.context.create %device
+
+  // expected-error @+1 {{data attribute must be null-terminated}}
+  %module = tfrt_gpu.module.load %context {
+    data = "not null-terminated"
+  }
+
+  tfrt.return
+}
diff --git a/backends/gpu/tools/stub_codegen/generate.sh b/backends/gpu/tools/stub_codegen/generate.sh
index f9430698bbd..e710ef9e6e6 100755
--- a/backends/gpu/tools/stub_codegen/generate.sh
+++ b/backends/gpu/tools/stub_codegen/generate.sh
@@ -21,7 +21,7 @@
 set -eux
 
 # Build the tools and generate the HIP header.
-bazel build --nocheck_visibility \
+bazel build --nocheck_visibility --config=gcc\
   //backends/gpu/tools/stub_codegen:header_codegen \
   //backends/gpu/tools/stub_codegen:impl_codegen
 
@@ -34,3 +34,10 @@ for API in "hip" "rocblas" "rocsolver" "miopen" "hipfft"; do
    ./bazel-bin/backends/gpu/tools/stub_codegen/impl_codegen \
        $(dirname $0)/$API.json | clang-format > $(printf $SRC_PATH $API)
 done
+
+# Hiprtc is currently rolled up in hip shared library. 
+# It is subject to change in future releases.
+./bazel-bin/backends/gpu/tools/stub_codegen/header_codegen \
+    $(dirname $0)/hiprtc.json | clang-format >> third_party/hip/hip_stub.h.inc
+./bazel-bin/backends/gpu/tools/stub_codegen/impl_codegen \
+    $(dirname $0)/hiprtc.json | clang-format >> third_party/hip/hip_stub.cc.inc
diff --git a/backends/gpu/tools/stub_codegen/hipfft.json b/backends/gpu/tools/stub_codegen/hipfft.json
index 36ab5cd5875..41b6c054af9 100644
--- a/backends/gpu/tools/stub_codegen/hipfft.json
+++ b/backends/gpu/tools/stub_codegen/hipfft.json
@@ -24,5 +24,5 @@
       "hipfftSetStream",
       "hipfftSetWorkArea",
       "hipfftSetAutoAllocation"
-   ],
+   ]
 }
diff --git a/backends/gpu/tools/stub_codegen/hiprtc.json b/backends/gpu/tools/stub_codegen/hiprtc.json
new file mode 100644
index 00000000000..3470f990588
--- /dev/null
+++ b/backends/gpu/tools/stub_codegen/hiprtc.json
@@ -0,0 +1,16 @@
+{
+   "header":"/opt/rocm-5.0.0/include/hip/hiprtc.h",
+   "extra_args":[
+      "-D__HIP_PLATFORM_AMD__",
+      "-I.",
+      "-I/opt/rocm-5.0.0/include/",
+      "-Ithird_party/llvm/llvm-project/clang/lib/Headers",
+      "-Ibazel-genfiles",
+      "-ferror-limit=0"
+   ],
+   "enums":[
+      "hiprtcResult"
+   ],
+   "functions":[
+   ]
+}
diff --git a/backends/gpu/tools/stub_codegen/miopen.json b/backends/gpu/tools/stub_codegen/miopen.json
index 956189c9272..63324e14f40 100644
--- a/backends/gpu/tools/stub_codegen/miopen.json
+++ b/backends/gpu/tools/stub_codegen/miopen.json
@@ -191,5 +191,5 @@
       "miopenSetDropoutDescriptor",
       "miopenDropoutForward",
       "miopenDropoutBackward"
-   ],
+   ]
 }
diff --git a/backends/gpu/tools/stub_codegen/rocblas.json b/backends/gpu/tools/stub_codegen/rocblas.json
index 0ebe21fd122..828f938d9b9 100644
--- a/backends/gpu/tools/stub_codegen/rocblas.json
+++ b/backends/gpu/tools/stub_codegen/rocblas.json
@@ -29,5 +29,5 @@
       "rocblas_dtrsm_batched",
       "rocblas_ctrsm_batched",
       "rocblas_ztrsm_batched"
-   ],
+   ]
 }
diff --git a/backends/gpu/tools/stub_codegen/rocsolver.json b/backends/gpu/tools/stub_codegen/rocsolver.json
index fe4fc6564a2..392a10acd9b 100644
--- a/backends/gpu/tools/stub_codegen/rocsolver.json
+++ b/backends/gpu/tools/stub_codegen/rocsolver.json
@@ -9,5 +9,5 @@
       "rocsolver_dpotrf",
       "rocsolver_cpotrf",
       "rocsolver_zpotrf"
-   ],
+   ]
 }
diff --git a/third_party/hip/hip_stub.h.inc b/third_party/hip/hip_stub.h.inc
index 7892e2e7995..119e3dd319b 100644
--- a/third_party/hip/hip_stub.h.inc
+++ b/third_party/hip/hip_stub.h.inc
@@ -462,3 +462,17 @@ enum hipDataType {
   HIP_C_32F = 4,
   HIP_C_64F = 5,
 };
+enum hiprtcResult {
+  HIPRTC_SUCCESS = 0,
+  HIPRTC_ERROR_OUT_OF_MEMORY = 1,
+  HIPRTC_ERROR_PROGRAM_CREATION_FAILURE = 2,
+  HIPRTC_ERROR_INVALID_INPUT = 3,
+  HIPRTC_ERROR_INVALID_PROGRAM = 4,
+  HIPRTC_ERROR_INVALID_OPTION = 5,
+  HIPRTC_ERROR_COMPILATION = 6,
+  HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7,
+  HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8,
+  HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9,
+  HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10,
+  HIPRTC_ERROR_INTERNAL_ERROR = 11,
+};
diff --git a/third_party/hip/hipfft_stub.cc.inc b/third_party/hip/hipfft_stub.cc.inc
index ffe4cebd3eb..f0296ba7cd8 100644
--- a/third_party/hip/hipfft_stub.cc.inc
+++ b/third_party/hip/hipfft_stub.cc.inc
@@ -1,3 +1,8 @@
+HIPFFT_EXPORT hipfftResult hipfftCreate(hipfftHandle* plan) {
+  return DynamicCall<decltype(hipfftCreate), &hipfftCreate>("hipfftCreate",
+                                                            plan);
+}
+
 HIPFFT_EXPORT hipfftResult hipfftMakePlanMany64(
     hipfftHandle plan, int rank, long long int* n, long long int* inembed,
     long long int istride, long long int idist, long long int* onembed,
@@ -13,17 +18,19 @@ HIPFFT_EXPORT hipfftResult hipfftGetSize(hipfftHandle plan, size_t* workSize) {
                                                               plan, workSize);
 }
 
+HIPFFT_EXPORT hipfftResult hipfftSetAutoAllocation(hipfftHandle plan,
+                                                   int autoAllocate) {
+  return DynamicCall<decltype(hipfftSetAutoAllocation),
+                     &hipfftSetAutoAllocation>("hipfftSetAutoAllocation", plan,
+                                               autoAllocate);
+}
+
 HIPFFT_EXPORT hipfftResult hipfftSetWorkArea(hipfftHandle plan,
                                              void* workArea) {
   return DynamicCall<decltype(hipfftSetWorkArea), &hipfftSetWorkArea>(
       "hipfftSetWorkArea", plan, workArea);
 }
 
-HIPFFT_EXPORT hipfftResult hipfftSetAutoAllocation(hipfftHandle plan, int enable) {
-  return DynamicCall<decltype(hipfftSetAutoAllocation), &hipfftSetAutoAllocation>(
-      "hipfftSetAutoAllocation", plan, enable);
-}
-
 HIPFFT_EXPORT hipfftResult hipfftExecC2C(hipfftHandle plan,
                                          hipfftComplex* idata,
                                          hipfftComplex* odata, int direction) {
@@ -72,11 +79,6 @@ HIPFFT_EXPORT hipfftResult hipfftSetStream(hipfftHandle plan,
       "hipfftSetStream", plan, stream);
 }
 
-HIPFFT_EXPORT hipfftResult hipfftCreate(hipfftHandle* plan) {
-  return DynamicCall<decltype(hipfftCreate), &hipfftCreate>("hipfftCreate",
-                                                            plan);
-}
-
 HIPFFT_EXPORT hipfftResult hipfftDestroy(hipfftHandle plan) {
   return DynamicCall<decltype(hipfftDestroy), &hipfftDestroy>("hipfftDestroy",
                                                               plan);
diff --git a/third_party/hip/hipfft_stub.h.inc b/third_party/hip/hipfft_stub.h.inc
index fee4121a479..cc70b112986 100644
--- a/third_party/hip/hipfft_stub.h.inc
+++ b/third_party/hip/hipfft_stub.h.inc
@@ -32,17 +32,20 @@ typedef enum hipfftLibraryPropertyType_t {
   HIPFFT_PATCH_LEVEL,
 } hipfftLibraryPropertyType;
 
+HIPFFT_EXPORT hipfftResult hipfftCreate(hipfftHandle* plan);
+
 HIPFFT_EXPORT hipfftResult hipfftMakePlanMany64(
     hipfftHandle plan, int rank, long long int* n, long long int* inembed,
     long long int istride, long long int idist, long long int* onembed,
     long long int ostride, long long int odist, hipfftType type,
-    long long int batch, size_t* work_size);
+    long long int batch, size_t* workSize);
 
 HIPFFT_EXPORT hipfftResult hipfftGetSize(hipfftHandle plan, size_t* workSize);
 
-HIPFFT_EXPORT hipfftResult hipfftSetWorkArea(hipfftHandle plan, void* workArea);
+HIPFFT_EXPORT hipfftResult hipfftSetAutoAllocation(hipfftHandle plan,
+                                                   int autoAllocate);
 
-HIPFFT_EXPORT hipfftResult hipfftSetAutoAllocation(hipfftHandle plan, int enable);
+HIPFFT_EXPORT hipfftResult hipfftSetWorkArea(hipfftHandle plan, void* workArea);
 
 HIPFFT_EXPORT hipfftResult hipfftExecC2C(hipfftHandle plan,
                                          hipfftComplex* idata,
@@ -71,8 +74,6 @@ HIPFFT_EXPORT hipfftResult hipfftExecZ2D(hipfftHandle plan,
 HIPFFT_EXPORT hipfftResult hipfftSetStream(hipfftHandle plan,
                                            hipStream_t stream);
 
-HIPFFT_EXPORT hipfftResult hipfftCreate(hipfftHandle* plan);
-
 HIPFFT_EXPORT hipfftResult hipfftDestroy(hipfftHandle plan);
 
 HIPFFT_EXPORT hipfftResult hipfftGetProperty(hipfftLibraryPropertyType type,