From b0b89af1feb0bc4e369d3150279d90d530cd8933 Mon Sep 17 00:00:00 2001 From: weihanmines Date: Wed, 19 Jan 2022 02:43:38 +0000 Subject: [PATCH 1/3] rocblas(mlir_tests) passed unit tests --- backends/gpu/lib/kernels/blas_kernels.cc | 33 +-- backends/gpu/mlir_tests/rocm/BUILD | 30 +++ backends/gpu/mlir_tests/rocm/blas.mlir | 193 ++++++++++++++++++ backends/gpu/tools/stub_codegen/hip.json | 5 +- backends/gpu/tools/stub_codegen/hipfft.json | 4 +- backends/gpu/tools/stub_codegen/miopen.json | 4 +- backends/gpu/tools/stub_codegen/rocblas.json | 4 +- .../gpu/tools/stub_codegen/rocsolver.json | 4 +- 8 files changed, 251 insertions(+), 26 deletions(-) create mode 100644 backends/gpu/mlir_tests/rocm/BUILD create mode 100644 backends/gpu/mlir_tests/rocm/blas.mlir diff --git a/backends/gpu/lib/kernels/blas_kernels.cc b/backends/gpu/lib/kernels/blas_kernels.cc index 042cb3713d2..29f2e072885 100644 --- a/backends/gpu/lib/kernels/blas_kernels.cc +++ b/backends/gpu/lib/kernels/blas_kernels.cc @@ -178,11 +178,11 @@ static Error BlasTrsmBatch( // TODO(hanbinyoon): Also support the ROCm function corresponding to // cublastrsmBatched. auto platform = handle->platform(); - if (platform != wrapper::Platform::CUDA) + if (platform != wrapper::Platform::CUDA && platform != wrapper::Platform::ROCm) return MakeStringError("Unsupported platform ", platform); - cudaDataType data_type = wrapper::BlasDataType::FromOpaqueValue(*dataType); - auto alpha_ptr = GetScalePointer(alpha, data_type); + rocblas_datatype data_type = wrapper::BlasDataType::FromOpaqueValue(*dataType); + auto alpha_ptr = GetScalePointer(alpha, static_cast(data_type)); if (!alpha_ptr) return alpha_ptr.takeError(); auto call = [&](auto dummy) { @@ -200,7 +200,7 @@ static Error BlasTrsmBatch( const T** a_array = const_cast(b_array + batchCount); auto side_mode = wrapper::BlasSideMode::FromOpaqueValue(*sideMode); - ptrdiff_t a_batch_stride = side_mode == CUBLAS_SIDE_LEFT ? m * m : n * n; + ptrdiff_t a_batch_stride = (side_mode == rocblas_side_left) ? m * m : n * n; ptrdiff_t b_batch_stride = m * n; const T* a_ptr = static_cast(A.pointer().raw(platform)); T* b_ptr = static_cast(B.pointer().raw(platform)); @@ -213,23 +213,24 @@ static Error BlasTrsmBatch( wrapper::Pointer b_array_ptr(b_array, platform); auto cast_alpha_ptr = static_cast>(*alpha_ptr); - return wrapper::CublasTrsmBatched( - *current, handle.get(), side_mode, - wrapper::BlasFillMode::FromOpaqueValue(*fillMode), - wrapper::BlasOperation::FromOpaqueValue(*trans), - wrapper::BlasDiagType::FromOpaqueValue(*diagType), m, n, cast_alpha_ptr, - a_array_ptr, heightA, b_array_ptr, heightB, batchCount); + + return wrapper::RocblasTrsmBatched( + *current, handle.get(), side_mode, + wrapper::BlasFillMode::FromOpaqueValue(*fillMode), + wrapper::BlasOperation::FromOpaqueValue(*trans), + wrapper::BlasDiagType::FromOpaqueValue(*diagType), m, n, cast_alpha_ptr, + a_array_ptr, heightA, b_array_ptr, heightB, batchCount); }; switch (data_type) { - case CUDA_R_32F: + case rocblas_datatype_f32_r: return call(float{}); - case CUDA_R_64F: + case rocblas_datatype_f64_r: return call(double{}); - case CUDA_C_32F: - return call(cuComplex{}); - case CUDA_C_64F: - return call(cuDoubleComplex{}); + case rocblas_datatype_f32_c: + return call(rocblas_float_complex{}); + case rocblas_datatype_f64_c: + return call(rocblas_double_complex{}); default: return MakeStringError("Unsupported data type ", data_type); } diff --git a/backends/gpu/mlir_tests/rocm/BUILD b/backends/gpu/mlir_tests/rocm/BUILD new file mode 100644 index 00000000000..b0284597910 --- /dev/null +++ b/backends/gpu/mlir_tests/rocm/BUILD @@ -0,0 +1,30 @@ +load("@tf_runtime//:build_defs.bzl", "if_oss") +load("@tf_runtime//tools:mlir_to_bef.bzl", "glob_tfrt_lit_tests") + +licenses(["notice"]) + +glob_tfrt_lit_tests( + data = [":test_utilities"], + # copybara:uncomment driver = "@tf_runtime//backends/gpu/mlir_tests:run_lit.sh", + tags_override = dict({ + file: ["requires-gpu-nvidia"] + for file in glob(["*.mlir"]) + }.items() + { + # Note: NCCL in the sandbox produces the wrong result in OSS. + "ccl.mlir": ["requires-gpu-nvidia:2"] + if_oss(["no-sandbox"]), + }.items()), + tfrt_translate = "@tf_runtime//backends/gpu:tfrt_gpu_translate", +) + +# Bundle together all of the test utilities that are used by tests. +filegroup( + name = "test_utilities", + testonly = True, + srcs = [ + "@llvm-project//llvm:FileCheck", + "@tf_runtime//backends/gpu:tfrt_gpu_executor", + "@tf_runtime//backends/gpu:tfrt_gpu_opt", + "@tf_runtime//backends/gpu:tfrt_gpu_translate", + "@tf_runtime//tools:bef_executor_lite", + ], +) diff --git a/backends/gpu/mlir_tests/rocm/blas.mlir b/backends/gpu/mlir_tests/rocm/blas.mlir new file mode 100644 index 00000000000..fb2969165a6 --- /dev/null +++ b/backends/gpu/mlir_tests/rocm/blas.mlir @@ -0,0 +1,193 @@ +// Copyright 2020 The TensorFlow Runtime Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// RUN: bef_executor_lite %s.bef | FileCheck %s + +// CHECK-LABEL: --- Running 'blas_axpy' +func @blas_axpy() { + %ch1 = tfrt.new.chain + %ordinal = tfrt.constant.i32 0 + %device = tfrt_gpu.device.get ROCm, %ordinal + %context = tfrt_gpu.context.create %device + %allocator = tfrt_gpu.allocator.create %context + %stream = tfrt_gpu.stream.create %context + %blas = tfrt_gpu.blas.create %context + + %buffer_length = tfrt.constant.i32 4 // [2, 2] = 4 floats + %buffer_size_bytes = tfrt.constant.i64 16 // [2, 2] * 4 bytes floats = 16 bytes + + %host_tensor = tfrt_dht.create_uninitialized_tensor.f32.2 [2 : i64, 2 : i64] + %host_buffer, %ch2 = tfrt_dht.get_buffer %host_tensor, %ch1 + + %ch3 = tfrt_dht.set_tensor_with_constant_values.f32 %host_tensor, %ch2 [1.0 : f32, 2.0 : f32, 3.0 : f32, 4.0 : f32] + %gpu_buffer_0 = tfrt_gpu.mem.allocate %allocator, %stream, %buffer_size_bytes, %ch3 + %ch4 = tfrt_gpu.mem.copy %gpu_buffer_0, %host_buffer, %stream, %ch3 : !tfrt_gpu.buffer, !ht.host_buffer + + %ch5 = tfrt_dht.set_tensor_with_constant_values.f32 %host_tensor, %ch4 [2.0 : f32, 3.0 : f32, 4.0 : f32, 5.0 : f32] + %gpu_buffer_1 = tfrt_gpu.mem.allocate %allocator, %stream, %buffer_size_bytes, %ch5 + %ch6 = tfrt_gpu.mem.copy %gpu_buffer_1, %host_buffer, %stream, %ch5 : !tfrt_gpu.buffer, !ht.host_buffer + + %stride = tfrt.constant.i32 1 + %alpha = tfrt.constant.f32 1.0 + %ch7 = tfrt_gpu.blas.axpy %blas, %stream, %buffer_length, %alpha, rocblas_datatype_f32_r, + %gpu_buffer_0, rocblas_datatype_f32_r, %stride, %gpu_buffer_1, rocblas_datatype_f32_r, %stride, + rocblas_datatype_f32_r, %ch6 + + %ch8 = tfrt_gpu.mem.copy %host_buffer, %gpu_buffer_1, %stream, %ch7 : !ht.host_buffer, !tfrt_gpu.buffer + %ch9 = tfrt_gpu.stream.synchronize %stream, %ch8 + // CHECK: DenseHostTensor dtype = f32, shape = [2, 2] + // CHECK-SAME: values = [3.000000e+00, 5.000000e+00, 7.000000e+00, 9.000000e+00] + %ch10 = tfrt_dht.print_tensor %host_tensor, %ch9 + + tfrt.return +} + +// CHECK-LABEL: --- Running 'blas_gemm' +func @blas_gemm() { + %ch1 = tfrt.new.chain + %ordinal = tfrt.constant.i32 0 + %device = tfrt_gpu.device.get ROCm, %ordinal + %context = tfrt_gpu.context.create %device + %allocator = tfrt_gpu.allocator.create %context + %stream = tfrt_gpu.stream.create %context + %blas = tfrt_gpu.blas.create %context + + %buffer_length = tfrt.constant.i32 4 // [2, 2] = 4 floats + %buffer_size_bytes = tfrt.constant.i64 16 // [2, 2] * 4 bytes floats = 16 bytes + + %host_tensor = tfrt_dht.create_uninitialized_tensor.f32.2 [2 : i64, 2 : i64] + %host_buffer, %ch2 = tfrt_dht.get_buffer %host_tensor, %ch1 + + %ch3 = tfrt_dht.set_tensor_with_constant_values.f32 %host_tensor, %ch2 [1.0 : f32, 2.0 : f32, 3.0 : f32, 4.0 : f32] + %gpu_buffer_0 = tfrt_gpu.mem.allocate %allocator, %stream, %buffer_size_bytes, %ch3 + %ch4 = tfrt_gpu.mem.copy %gpu_buffer_0, %host_buffer, %stream, %ch3 : !tfrt_gpu.buffer, !ht.host_buffer + + %ch5 = tfrt_dht.set_tensor_with_constant_values.f32 %host_tensor, %ch4 [2.0 : f32, 3.0 : f32, 4.0 : f32, 5.0 : f32] + %gpu_buffer_1 = tfrt_gpu.mem.allocate %allocator, %stream, %buffer_size_bytes, %ch5 + %ch6 = tfrt_gpu.mem.copy %gpu_buffer_1, %host_buffer, %stream, %ch5 : !tfrt_gpu.buffer, !ht.host_buffer + + %ch7 = tfrt_dht.set_tensor_with_constant_values.f32 %host_tensor, %ch6 [0.0 : f32, 0.0 : f32, 0.0 : f32, 0.0 : f32] + %gpu_buffer_2 = tfrt_gpu.mem.allocate %allocator, %stream, %buffer_size_bytes, %ch7 + %ch8 = tfrt_gpu.mem.copy %gpu_buffer_2, %host_buffer, %stream, %ch7 : !tfrt_gpu.buffer, !ht.host_buffer + + %dim = tfrt.constant.i32 2 + %alpha = tfrt.constant.f32 1.0 + %beta = tfrt.constant.f32 1.0 + %algo = tfrt_gpu.blas.gemm.algo rocblas_gemm_algo_standard + %ch9 = tfrt_gpu.blas.gemm %blas, %stream, + rocblas_operation_none, rocblas_operation_none, %dim, %dim, %dim, + %alpha, %gpu_buffer_0, rocblas_datatype_f32_r, %dim, + %gpu_buffer_1, rocblas_datatype_f32_r, %dim, %beta, + %gpu_buffer_2, rocblas_datatype_f32_r, %dim, + rocblas_datatype_f32_r, %algo, %ch8 + + %ch10 = tfrt_gpu.mem.copy %host_buffer, %gpu_buffer_2, %stream, %ch9 : !ht.host_buffer, !tfrt_gpu.buffer + %ch11 = tfrt_gpu.stream.synchronize %stream, %ch10 + // CHECK: DenseHostTensor dtype = f32, shape = [2, 2] + // CHECK-SAME: values = [1.100000e+01, 1.600000e+01, 1.900000e+01, 2.800000e+01] + %ch12 = tfrt_dht.print_tensor %host_tensor, %ch11 + + tfrt.return +} + +// CHECK-LABEL: --- Running 'blas_gemm_batched' +func @blas_gemm_batched() { + %ch1 = tfrt.new.chain + %ordinal = tfrt.constant.i32 0 + %device = tfrt_gpu.device.get ROCm, %ordinal + %context = tfrt_gpu.context.create %device + %allocator = tfrt_gpu.allocator.create %context + %stream = tfrt_gpu.stream.create %context + %blas = tfrt_gpu.blas.create %context + + %buffer_length = tfrt.constant.i32 4 // [2, 2] = 4 floats + %buffer_size_bytes = tfrt.constant.i64 16 // [2, 2] * 4 bytes floats = 16 bytes + + %host_tensor = tfrt_dht.create_uninitialized_tensor.f32.2 [2 : i64, 2 : i64] + %host_buffer, %ch2 = tfrt_dht.get_buffer %host_tensor, %ch1 + + %ch3 = tfrt_dht.set_tensor_with_constant_values.f32 %host_tensor, %ch2 [1.0 : f32, 2.0 : f32, 3.0 : f32, 4.0 : f32] + %gpu_buffer_0 = tfrt_gpu.mem.allocate %allocator, %stream, %buffer_size_bytes, %ch3 + %ch4 = tfrt_gpu.mem.copy %gpu_buffer_0, %host_buffer, %stream, %ch3 : !tfrt_gpu.buffer, !ht.host_buffer + + %ch5 = tfrt_dht.set_tensor_with_constant_values.f32 %host_tensor, %ch4 [2.0 : f32, 3.0 : f32, 4.0 : f32, 5.0 : f32] + %gpu_buffer_1 = tfrt_gpu.mem.allocate %allocator, %stream, %buffer_size_bytes, %ch5 + %ch6 = tfrt_gpu.mem.copy %gpu_buffer_1, %host_buffer, %stream, %ch5 : !tfrt_gpu.buffer, !ht.host_buffer + + %ch7 = tfrt_dht.set_tensor_with_constant_values.f32 %host_tensor, %ch6 [0.0 : f32, 0.0 : f32, 0.0 : f32, 0.0 : f32] + %gpu_buffer_2 = tfrt_gpu.mem.allocate %allocator, %stream, %buffer_size_bytes, %ch7 + %ch8 = tfrt_gpu.mem.copy %gpu_buffer_2, %host_buffer, %stream, %ch7 : !tfrt_gpu.buffer, !ht.host_buffer + + %dim = tfrt.constant.i32 2 + %type = tfrt.constant.i32 0 + %algo = tfrt_gpu.blas.gemm.algo rocblas_gemm_algo_standard + %alpha = tfrt.constant.f32 1.0 + %beta = tfrt.constant.f32 1.0 + %batch_count = tfrt.constant.i32 1 + %stride = tfrt.constant.i64 1 + %ch9 = tfrt_gpu.blas.gemm.batch %blas, %stream, + rocblas_operation_none, rocblas_operation_none, %dim, %dim, %dim, + %alpha, %gpu_buffer_0, rocblas_datatype_f32_r, %dim, %stride, + %gpu_buffer_1, rocblas_datatype_f32_r, %dim, %stride, %beta, + %gpu_buffer_2, rocblas_datatype_f32_r, %dim, %stride, %batch_count, + rocblas_datatype_f32_r, %algo, %ch8 + + %ch10 = tfrt_gpu.mem.copy %host_buffer, %gpu_buffer_2, %stream, %ch9 : !ht.host_buffer, !tfrt_gpu.buffer + %ch11 = tfrt_gpu.stream.synchronize %stream, %ch10 + // CHECK: DenseHostTensor dtype = f32, shape = [2, 2] + // CHECK-SAME: values = [1.100000e+01, 1.600000e+01, 1.900000e+01, 2.800000e+01] + %ch12 = tfrt_dht.print_tensor %host_tensor, %ch11 + + tfrt.return +} + +// CHECK-LABEL: --- Running 'blas_trsm_batched' +func @blas_trsm_batched() { + %ch0 = tfrt.new.chain + %ordinal = tfrt.constant.i32 0 + %device = tfrt_gpu.device.get ROCm, %ordinal + %context = tfrt_gpu.context.create %device + %allocator = tfrt_gpu.allocator.create %context + %stream = tfrt_gpu.stream.create %context + %blas = tfrt_gpu.blas.create %context + + %buffer_size_bytes = tfrt.constant.i64 16 // [2, 2] * 4 bytes floats = 16 bytes + + %host_tensor = tfrt_dht.create_uninitialized_tensor.f32.2 [2 : i64, 2 : i64] + %host_buffer, %ch1 = tfrt_dht.get_buffer %host_tensor, %ch0 + + %ch2 = tfrt_dht.set_tensor_with_constant_values.f32 %host_tensor, %ch1 [1.0 : f32, 2.0 : f32, 0.0 : f32, 1.0 : f32] + %gpu_buffer_0 = tfrt_gpu.mem.allocate %allocator, %stream, %buffer_size_bytes, %ch2 + %ch3 = tfrt_gpu.mem.copy %gpu_buffer_0, %host_buffer, %stream, %ch2 : !tfrt_gpu.buffer, !ht.host_buffer + + %ch4 = tfrt_dht.set_tensor_with_constant_values.f32 %host_tensor, %ch3 [1.0 : f32, 4.0 : f32, 0.0 : f32, 0.0 : f32] + %gpu_buffer_1 = tfrt_gpu.mem.allocate %allocator, %stream, %buffer_size_bytes, %ch4 + %ch5 = tfrt_gpu.mem.copy %gpu_buffer_1, %host_buffer, %stream, %ch4 : !tfrt_gpu.buffer, !ht.host_buffer + + %dim = tfrt.constant.i32 2 + %alpha = tfrt.constant.f32 1.0 + %batch_count = tfrt.constant.i32 1 + %ch6 = tfrt_gpu.blas.trsm.batch %blas, %stream, rocblas_side_left, + rocblas_fill_lower, rocblas_operation_none, rocblas_diagonal_unit, %dim, %dim, + rocblas_datatype_f32_r, %alpha, %gpu_buffer_0, %dim, %gpu_buffer_1, %dim, %batch_count, + %ch5 + + %ch7 = tfrt_gpu.mem.copy %host_buffer, %gpu_buffer_1, %stream, %ch6 : !ht.host_buffer, !tfrt_gpu.buffer + %ch8 = tfrt_gpu.stream.synchronize %stream, %ch7 + // CHECK: DenseHostTensor dtype = f32, shape = [2, 2] + // CHECK-SAME: values = [1.000000e+00, 2.000000e+00, 0.000000e+00, 0.000000e+00] + %ch9 = tfrt_dht.print_tensor %host_tensor, %ch8 + + tfrt.return +} diff --git a/backends/gpu/tools/stub_codegen/hip.json b/backends/gpu/tools/stub_codegen/hip.json index ceb91551159..307fa302c38 100644 --- a/backends/gpu/tools/stub_codegen/hip.json +++ b/backends/gpu/tools/stub_codegen/hip.json @@ -1,9 +1,9 @@ { - "header":"/opt/rocm-4.2.0/include/hip/hip_runtime.h", + "header":"/opt/rocm-4.5.2/include/hip/hip_runtime.h", "extra_args":[ "-D__HIP_PLATFORM_AMD__", "-I.", - "-I/opt/rocm-4.2.0/include/", + "-I/opt/rocm-4.5.2/include/", "-Ithird_party/llvm/llvm-project/clang/lib/Headers", "-Ibazel-genfiles", "-ferror-limit=0" @@ -13,6 +13,7 @@ "hipDeviceAttribute_t", "hipJitOption", "hipLimit_t", + "hipDataType", "hipMemoryType", "hipMemcpyKind", "hipFunction_attribute", diff --git a/backends/gpu/tools/stub_codegen/hipfft.json b/backends/gpu/tools/stub_codegen/hipfft.json index d9d9179b31d..802b86c55c8 100644 --- a/backends/gpu/tools/stub_codegen/hipfft.json +++ b/backends/gpu/tools/stub_codegen/hipfft.json @@ -1,8 +1,8 @@ { - "header":"/opt/rocm-4.2.0/hipfft/include/hipfft.h", + "header":"/opt/rocm-4.5.2/hipfft/include/hipfft.h", "extra_args":[ "-I.", - "-I/opt/rocm-4.2.0/include/" + "-I/opt/rocm-4.5.2/include/" ], "enums":[ "hipfftLibraryPropertyType", diff --git a/backends/gpu/tools/stub_codegen/miopen.json b/backends/gpu/tools/stub_codegen/miopen.json index 57c9983db12..25277eb6ed0 100644 --- a/backends/gpu/tools/stub_codegen/miopen.json +++ b/backends/gpu/tools/stub_codegen/miopen.json @@ -1,8 +1,8 @@ { - "header":"/opt/rocm-4.2.0/include/miopen/miopen.h", + "header":"/opt/rocm-4.5.2/include/miopen/miopen.h", "extra_args":[ "-I.", - "-I/opt/rocm-4.2.0/include/" + "-I/opt/rocm-4.5.2/include/" ], "enums":[ "miopenStatus_t", diff --git a/backends/gpu/tools/stub_codegen/rocblas.json b/backends/gpu/tools/stub_codegen/rocblas.json index c071842474b..8d8c22966c8 100644 --- a/backends/gpu/tools/stub_codegen/rocblas.json +++ b/backends/gpu/tools/stub_codegen/rocblas.json @@ -1,8 +1,8 @@ { - "header":"/opt/rocm-4.2.0/include/rocblas.h", + "header":"/opt/rocm-4.5.2/include/rocblas.h", "extra_args":[ "-I.", - "-I/opt/rocm-4.2.0/include/" + "-I/opt/rocm-4.5.2/include/" ], "enums":[ "rocblas_operation", diff --git a/backends/gpu/tools/stub_codegen/rocsolver.json b/backends/gpu/tools/stub_codegen/rocsolver.json index c72ba7a891c..053b8895b64 100644 --- a/backends/gpu/tools/stub_codegen/rocsolver.json +++ b/backends/gpu/tools/stub_codegen/rocsolver.json @@ -1,8 +1,8 @@ { - "header":"/opt/rocm-4.2.0/include/rocsolver.h", + "header":"/opt/rocm-4.5.2/include/rocsolver.h", "extra_args":[ "-I.", - "-I/opt/rocm-4.2.0/include/" + "-I/opt/rocm-4.5.2/include/" ], "functions":[ "rocsolver_spotrf", From 85633100c15b0091d42fb58d446542157f88264b Mon Sep 17 00:00:00 2001 From: weihanmines Date: Wed, 19 Jan 2022 16:08:43 +0000 Subject: [PATCH 2/3] inc files added --- third_party/hip/hip_stub.cc.inc | 33 ++++++------ third_party/hip/hip_stub.h.inc | 34 +++++++++--- third_party/hip/miopen_stub.h.inc | 1 + third_party/hip/rocblas_stub.cc.inc | 84 ++++++++++++++--------------- third_party/hip/rocblas_stub.h.inc | 53 +++++++++--------- 5 files changed, 113 insertions(+), 92 deletions(-) diff --git a/third_party/hip/hip_stub.cc.inc b/third_party/hip/hip_stub.cc.inc index fb17e9af57e..1a0431d3ca0 100644 --- a/third_party/hip/hip_stub.cc.inc +++ b/third_party/hip/hip_stub.cc.inc @@ -12,16 +12,6 @@ hipError_t hipRuntimeGetVersion(int* runtimeVersion) { "hipRuntimeGetVersion", runtimeVersion); } -hipError_t hipGetLastError(void) { - return DynamicCall( - "hipGetLastError"); -} - -hipError_t hipPeekAtLastError(void) { - return DynamicCall( - "hipPeekAtLastError"); -} - hipError_t hipDeviceGet(hipDevice_t* device, int ordinal) { return DynamicCall("hipDeviceGet", device, ordinal); @@ -73,6 +63,16 @@ hipError_t hipDeviceGetLimit(size_t* pValue, enum hipLimit_t limit) { "hipDeviceGetLimit", pValue, limit); } +hipError_t hipGetLastError(void) { + return DynamicCall( + "hipGetLastError"); +} + +hipError_t hipPeekAtLastError(void) { + return DynamicCall( + "hipPeekAtLastError"); +} + hipError_t hipStreamCreateWithFlags(hipStream_t* stream, unsigned int flags) { return DynamicCall("hipStreamCreateWithFlags", @@ -209,6 +209,12 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, sizeBytes, kind); } +hipError_t hipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes, + hipModule_t hmod, const char* name) { + return DynamicCall( + "hipModuleGetGlobal", dptr, bytes, hmod, name); +} + hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream __dparm(0)) { return DynamicCall( @@ -376,13 +382,6 @@ hipError_t hipModuleGetFunction(hipFunction_t* function, hipModule_t module, "hipModuleGetFunction", function, module, kname); } -hipError_t hipModuleGetGlobal(void** ptr, size_t* bytes, hipModule_t module, - const char* kname) { - return DynamicCall( - "hipModuleGetGlobal", ptr, bytes, module, kname); -} - - hipError_t hipFuncGetAttributes(struct hipFuncAttributes* attr, const void* func) { return DynamicCall( diff --git a/third_party/hip/hip_stub.h.inc b/third_party/hip/hip_stub.h.inc index fc8d5fae4ff..c7dcedec42b 100644 --- a/third_party/hip/hip_stub.h.inc +++ b/third_party/hip/hip_stub.h.inc @@ -62,12 +62,22 @@ enum hipError_t { hipErrorPeerAccessAlreadyEnabled = 704, hipErrorPeerAccessNotEnabled = 705, hipErrorSetOnActiveProcess = 708, + hipErrorContextIsDestroyed = 709, hipErrorAssert = 710, hipErrorHostMemoryAlreadyRegistered = 712, hipErrorHostMemoryNotRegistered = 713, hipErrorLaunchFailure = 719, hipErrorCooperativeLaunchTooLarge = 720, hipErrorNotSupported = 801, + hipErrorStreamCaptureUnsupported = 900, + hipErrorStreamCaptureInvalidated = 901, + hipErrorStreamCaptureMerge = 902, + hipErrorStreamCaptureUnmatched = 903, + hipErrorStreamCaptureUnjoined = 904, + hipErrorStreamCaptureIsolation = 905, + hipErrorStreamCaptureImplicit = 906, + hipErrorCapturedEvent = 907, + hipErrorStreamCaptureWrongThread = 908, hipErrorUnknown = 999, hipErrorRuntimeMemory = 1052, hipErrorRuntimeOther = 1053, @@ -153,6 +163,7 @@ enum hipFunction_attribute { }; enum hipLimit_t { + hipLimitPrintfFifoSize = 0x01, hipLimitMallocHeapSize = 0x02, }; @@ -196,10 +207,6 @@ hipError_t hipDriverGetVersion(int* driverVersion); hipError_t hipRuntimeGetVersion(int* runtimeVersion); -hipError_t hipGetLastError(void); - -hipError_t hipPeekAtLastError(void); - hipError_t hipDeviceGet(hipDevice_t* device, int ordinal); hipError_t hipDeviceGetName(char* name, int len, hipDevice_t device); @@ -221,6 +228,10 @@ hipError_t hipGetDeviceProperties(hipDeviceProp_t* prop, int deviceId); hipError_t hipDeviceGetLimit(size_t* pValue, enum hipLimit_t limit); +hipError_t hipGetLastError(void); + +hipError_t hipPeekAtLastError(void); + hipError_t hipStreamCreateWithFlags(hipStream_t* stream, unsigned int flags); hipError_t hipStreamCreateWithPriority(hipStream_t* stream, unsigned int flags, @@ -278,6 +289,9 @@ hipError_t hipHostFree(void* ptr); hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind); +hipError_t hipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes, + hipModule_t hmod, const char* name); + hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream __dparm(0)); @@ -353,9 +367,6 @@ hipError_t hipModuleUnload(hipModule_t module); hipError_t hipModuleGetFunction(hipFunction_t* function, hipModule_t module, const char* kname); -hipError_t hipModuleGetGlobal(void** ptr, size_t* bytes, hipModule_t module, - const char* kname); - hipError_t hipFuncGetAttributes(struct hipFuncAttributes* attr, const void* func); @@ -385,3 +396,12 @@ hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize, const void* f, size_t dynSharedMemPerBlk, int blockSizeLimit); + +enum hipDataType { + HIP_R_16F = 2, + HIP_R_32F = 0, + HIP_R_64F = 1, + HIP_C_16F = 6, + HIP_C_32F = 4, + HIP_C_64F = 5, +}; diff --git a/third_party/hip/miopen_stub.h.inc b/third_party/hip/miopen_stub.h.inc index b3794d392aa..2ccf9e06b54 100644 --- a/third_party/hip/miopen_stub.h.inc +++ b/third_party/hip/miopen_stub.h.inc @@ -46,6 +46,7 @@ typedef enum { miopenInt8 = 3, miopenInt8x4 = 4, miopenBFloat16 = 5, + miopenDouble = 6, } miopenDataType_t; typedef enum { diff --git a/third_party/hip/rocblas_stub.cc.inc b/third_party/hip/rocblas_stub.cc.inc index 40c5d05975c..3c10e0c723d 100644 --- a/third_party/hip/rocblas_stub.cc.inc +++ b/third_party/hip/rocblas_stub.cc.inc @@ -34,48 +34,6 @@ ROCBLAS_EXPORT rocblas_status rocblas_get_pointer_mode( handle, pointer_mode); } -ROCBLAS_EXPORT rocblas_status rocblas_gemm_ex( - rocblas_handle handle, rocblas_operation transA, rocblas_operation transB, - rocblas_int m, rocblas_int n, rocblas_int k, const void* alpha, - const void* a, rocblas_datatype a_type, rocblas_int lda, const void* b, - rocblas_datatype b_type, rocblas_int ldb, const void* beta, const void* c, - rocblas_datatype c_type, rocblas_int ldc, void* d, rocblas_datatype d_type, - rocblas_int ldd, rocblas_datatype compute_type, rocblas_gemm_algo algo, - int32_t solution_index, uint32_t flags) { - return DynamicCall( - "rocblas_gemm_ex", handle, transA, transB, m, n, k, alpha, a, a_type, lda, - b, b_type, ldb, beta, c, c_type, ldc, d, d_type, ldd, compute_type, algo, - solution_index, flags); -} - -ROCBLAS_EXPORT rocblas_status rocblas_gemm_strided_batched_ex( - rocblas_handle handle, rocblas_operation transA, rocblas_operation transB, - rocblas_int m, rocblas_int n, rocblas_int k, const void* alpha, - const void* a, rocblas_datatype a_type, rocblas_int lda, - rocblas_stride stride_a, const void* b, rocblas_datatype b_type, - rocblas_int ldb, rocblas_stride stride_b, const void* beta, const void* c, - rocblas_datatype c_type, rocblas_int ldc, rocblas_stride stride_c, void* d, - rocblas_datatype d_type, rocblas_int ldd, rocblas_stride stride_d, - rocblas_int batch_count, rocblas_datatype compute_type, - rocblas_gemm_algo algo, int32_t solution_index, uint32_t flags) { - return DynamicCall( - "rocblas_gemm_strided_batched_ex", handle, transA, transB, m, n, k, alpha, - a, a_type, lda, stride_a, b, b_type, ldb, stride_b, beta, c, c_type, ldc, - stride_c, d, d_type, ldd, stride_d, batch_count, compute_type, algo, - solution_index, flags); -} - -ROCBLAS_EXPORT rocblas_status rocblas_axpy_ex( - rocblas_handle handle, rocblas_int n, const void* alpha, - rocblas_datatype alpha_type, const void* x, rocblas_datatype x_type, - rocblas_int incx, void* y, rocblas_datatype y_type, rocblas_int incy, - rocblas_datatype execution_type) { - return DynamicCall( - "rocblas_axpy_ex", handle, n, alpha, alpha_type, x, x_type, incx, y, - y_type, incy, execution_type); -} - ROCBLAS_EXPORT rocblas_status rocblas_strsm_batched( rocblas_handle handle, rocblas_side side, rocblas_fill uplo, rocblas_operation transA, rocblas_diagonal diag, rocblas_int m, @@ -120,3 +78,45 @@ ROCBLAS_EXPORT rocblas_status rocblas_ztrsm_batched( "rocblas_ztrsm_batched", handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb, batch_count); } + +ROCBLAS_EXPORT rocblas_status rocblas_gemm_ex( + rocblas_handle handle, rocblas_operation transA, rocblas_operation transB, + rocblas_int m, rocblas_int n, rocblas_int k, const void* alpha, + const void* a, rocblas_datatype a_type, rocblas_int lda, const void* b, + rocblas_datatype b_type, rocblas_int ldb, const void* beta, const void* c, + rocblas_datatype c_type, rocblas_int ldc, void* d, rocblas_datatype d_type, + rocblas_int ldd, rocblas_datatype compute_type, rocblas_gemm_algo algo, + int32_t solution_index, uint32_t flags) { + return DynamicCall( + "rocblas_gemm_ex", handle, transA, transB, m, n, k, alpha, a, a_type, lda, + b, b_type, ldb, beta, c, c_type, ldc, d, d_type, ldd, compute_type, algo, + solution_index, flags); +} + +ROCBLAS_EXPORT rocblas_status rocblas_gemm_strided_batched_ex( + rocblas_handle handle, rocblas_operation transA, rocblas_operation transB, + rocblas_int m, rocblas_int n, rocblas_int k, const void* alpha, + const void* a, rocblas_datatype a_type, rocblas_int lda, + rocblas_stride stride_a, const void* b, rocblas_datatype b_type, + rocblas_int ldb, rocblas_stride stride_b, const void* beta, const void* c, + rocblas_datatype c_type, rocblas_int ldc, rocblas_stride stride_c, void* d, + rocblas_datatype d_type, rocblas_int ldd, rocblas_stride stride_d, + rocblas_int batch_count, rocblas_datatype compute_type, + rocblas_gemm_algo algo, int32_t solution_index, uint32_t flags) { + return DynamicCall( + "rocblas_gemm_strided_batched_ex", handle, transA, transB, m, n, k, alpha, + a, a_type, lda, stride_a, b, b_type, ldb, stride_b, beta, c, c_type, ldc, + stride_c, d, d_type, ldd, stride_d, batch_count, compute_type, algo, + solution_index, flags); +} + +ROCBLAS_EXPORT rocblas_status rocblas_axpy_ex( + rocblas_handle handle, rocblas_int n, const void* alpha, + rocblas_datatype alpha_type, const void* x, rocblas_datatype x_type, + rocblas_int incx, void* y, rocblas_datatype y_type, rocblas_int incy, + rocblas_datatype execution_type) { + return DynamicCall( + "rocblas_axpy_ex", handle, n, alpha, alpha_type, x, x_type, incx, y, + y_type, incy, execution_type); +} diff --git a/third_party/hip/rocblas_stub.h.inc b/third_party/hip/rocblas_stub.h.inc index 7f8318362b6..eddf6ec949c 100644 --- a/third_party/hip/rocblas_stub.h.inc +++ b/third_party/hip/rocblas_stub.h.inc @@ -71,6 +71,7 @@ typedef enum rocblas_gemm_algo_ { typedef enum rocblas_gemm_flags_ { rocblas_gemm_flags_none = 0x0, rocblas_gemm_flags_pack_int8x4 = 0x1, + rocblas_gemm_flags_use_cu_efficiency = 0x2, } rocblas_gemm_flags; ROCBLAS_EXPORT rocblas_status rocblas_create_handle(rocblas_handle* handle); @@ -89,32 +90,6 @@ ROCBLAS_EXPORT rocblas_status rocblas_set_pointer_mode( ROCBLAS_EXPORT rocblas_status rocblas_get_pointer_mode( rocblas_handle handle, rocblas_pointer_mode* pointer_mode); -ROCBLAS_EXPORT rocblas_status rocblas_gemm_ex( - rocblas_handle handle, rocblas_operation transA, rocblas_operation transB, - rocblas_int m, rocblas_int n, rocblas_int k, const void* alpha, - const void* a, rocblas_datatype a_type, rocblas_int lda, const void* b, - rocblas_datatype b_type, rocblas_int ldb, const void* beta, const void* c, - rocblas_datatype c_type, rocblas_int ldc, void* d, rocblas_datatype d_type, - rocblas_int ldd, rocblas_datatype compute_type, rocblas_gemm_algo algo, - int32_t solution_index, uint32_t flags); - -ROCBLAS_EXPORT rocblas_status rocblas_gemm_strided_batched_ex( - rocblas_handle handle, rocblas_operation transA, rocblas_operation transB, - rocblas_int m, rocblas_int n, rocblas_int k, const void* alpha, - const void* a, rocblas_datatype a_type, rocblas_int lda, - rocblas_stride stride_a, const void* b, rocblas_datatype b_type, - rocblas_int ldb, rocblas_stride stride_b, const void* beta, const void* c, - rocblas_datatype c_type, rocblas_int ldc, rocblas_stride stride_c, void* d, - rocblas_datatype d_type, rocblas_int ldd, rocblas_stride stride_d, - rocblas_int batch_count, rocblas_datatype compute_type, - rocblas_gemm_algo algo, int32_t solution_index, uint32_t flags); - -ROCBLAS_EXPORT rocblas_status rocblas_axpy_ex( - rocblas_handle handle, rocblas_int n, const void* alpha, - rocblas_datatype alpha_type, const void* x, rocblas_datatype x_type, - rocblas_int incx, void* y, rocblas_datatype y_type, rocblas_int incy, - rocblas_datatype execution_type); - ROCBLAS_EXPORT rocblas_status rocblas_strsm_batched( rocblas_handle handle, rocblas_side side, rocblas_fill uplo, rocblas_operation transA, rocblas_diagonal diag, rocblas_int m, @@ -142,3 +117,29 @@ ROCBLAS_EXPORT rocblas_status rocblas_ztrsm_batched( const rocblas_double_complex* const A[], rocblas_int lda, rocblas_double_complex* const B[], rocblas_int ldb, rocblas_int batch_count); + +ROCBLAS_EXPORT rocblas_status rocblas_gemm_ex( + rocblas_handle handle, rocblas_operation transA, rocblas_operation transB, + rocblas_int m, rocblas_int n, rocblas_int k, const void* alpha, + const void* a, rocblas_datatype a_type, rocblas_int lda, const void* b, + rocblas_datatype b_type, rocblas_int ldb, const void* beta, const void* c, + rocblas_datatype c_type, rocblas_int ldc, void* d, rocblas_datatype d_type, + rocblas_int ldd, rocblas_datatype compute_type, rocblas_gemm_algo algo, + int32_t solution_index, uint32_t flags); + +ROCBLAS_EXPORT rocblas_status rocblas_gemm_strided_batched_ex( + rocblas_handle handle, rocblas_operation transA, rocblas_operation transB, + rocblas_int m, rocblas_int n, rocblas_int k, const void* alpha, + const void* a, rocblas_datatype a_type, rocblas_int lda, + rocblas_stride stride_a, const void* b, rocblas_datatype b_type, + rocblas_int ldb, rocblas_stride stride_b, const void* beta, const void* c, + rocblas_datatype c_type, rocblas_int ldc, rocblas_stride stride_c, void* d, + rocblas_datatype d_type, rocblas_int ldd, rocblas_stride stride_d, + rocblas_int batch_count, rocblas_datatype compute_type, + rocblas_gemm_algo algo, int32_t solution_index, uint32_t flags); + +ROCBLAS_EXPORT rocblas_status rocblas_axpy_ex( + rocblas_handle handle, rocblas_int n, const void* alpha, + rocblas_datatype alpha_type, const void* x, rocblas_datatype x_type, + rocblas_int incx, void* y, rocblas_datatype y_type, rocblas_int incy, + rocblas_datatype execution_type); From db2ced44941cd056047bd5f706a05ce744749d4a Mon Sep 17 00:00:00 2001 From: weihanmines Date: Mon, 7 Mar 2022 23:39:37 +0000 Subject: [PATCH 3/3] add ptx alternatives in ROCm --- .../include/tfrt/gpu/wrapper/hip_forwards.h | 2 + .../gpu/include/tfrt/gpu/wrapper/hip_stub.h | 23 +++++ .../include/tfrt/gpu/wrapper/hip_wrapper.h | 3 + backends/gpu/lib/kernels/driver_kernels.cc | 1 - backends/gpu/lib/wrapper/driver_wrapper.cc | 2 +- backends/gpu/lib/wrapper/hip_stub.cc | 89 ++++++++++++++++++ backends/gpu/lib/wrapper/hip_wrapper.cc | 45 +++++++++ backends/gpu/mlir_tests/rocm/module.mlir | 94 +++++++++++++++++++ backends/gpu/tools/stub_codegen/generate.sh | 9 +- backends/gpu/tools/stub_codegen/hipfft.json | 2 +- backends/gpu/tools/stub_codegen/hiprtc.json | 16 ++++ backends/gpu/tools/stub_codegen/miopen.json | 2 +- backends/gpu/tools/stub_codegen/rocblas.json | 2 +- .../gpu/tools/stub_codegen/rocsolver.json | 2 +- third_party/hip/hip_stub.h.inc | 14 +++ third_party/hip/hipfft_stub.cc.inc | 22 +++-- third_party/hip/hipfft_stub.h.inc | 11 ++- 17 files changed, 317 insertions(+), 22 deletions(-) create mode 100644 backends/gpu/mlir_tests/rocm/module.mlir create mode 100644 backends/gpu/tools/stub_codegen/hiprtc.json diff --git a/backends/gpu/include/tfrt/gpu/wrapper/hip_forwards.h b/backends/gpu/include/tfrt/gpu/wrapper/hip_forwards.h index dc3e71ff986..b400636a090 100644 --- a/backends/gpu/include/tfrt/gpu/wrapper/hip_forwards.h +++ b/backends/gpu/include/tfrt/gpu/wrapper/hip_forwards.h @@ -59,6 +59,8 @@ using ncclComm_t = struct ncclComm *; // Forward declaration of hipFFT types. using hipfftHandle = struct hipfftHandle_t *; +// Forward declaration of hiprtcProgram +using hiprtcProgram = struct _hiprtcProgram *; // Enums for corresponding #defines in the hipFFT headers. enum hipfftDirection_t : int { HIPFFT_FORWARD = -1, diff --git a/backends/gpu/include/tfrt/gpu/wrapper/hip_stub.h b/backends/gpu/include/tfrt/gpu/wrapper/hip_stub.h index 3908eed5205..646aa532827 100644 --- a/backends/gpu/include/tfrt/gpu/wrapper/hip_stub.h +++ b/backends/gpu/include/tfrt/gpu/wrapper/hip_stub.h @@ -52,6 +52,29 @@ extern "C" { const char* hipGetErrorName(hipError_t hip_error); const char* hipGetErrorString(hipError_t hip_error); +const char *hiprtcGetErrorString(hiprtcResult result); +hiprtcResult hiprtcVersion(int* major, int* minor); +hiprtcResult hiprtcAddNameExpression(hiprtcProgram prog, const char* name_expression); +hiprtcResult hiprtcCompileProgram( + hiprtcProgram prog, + int numOptions, + const char** options); +hiprtcResult hiprtcCreateProgram( + hiprtcProgram* prog, + const char* src, + const char* name, + int numberHeaders, + char** headers, + const char** includeNames); +hiprtcResult hiprtcDestroyProgram(hiprtcProgram* prog); +hiprtcResult hiprtcGetLoweredName( + hiprtcProgram prog, + const char* name_expression, + const char** lowered_name); +hiprtcResult hiprtcGetProgramLog(hiprtcProgram prog, char* log); +hiprtcResult hiprtcGetProgramLogSize(hiprtcProgram prog, size_t* logSizeRet); +hiprtcResult hiprtcGetCode(hiprtcProgram prog, char* code); +hiprtcResult hiprtcGetCodeSize(hiprtcProgram prog, size_t* codeSizeRet); // Enums for corresponding #defines in the HIP headers. enum hipDeviceFlags_t { diff --git a/backends/gpu/include/tfrt/gpu/wrapper/hip_wrapper.h b/backends/gpu/include/tfrt/gpu/wrapper/hip_wrapper.h index 98a3f0bc0fd..92feac7d949 100644 --- a/backends/gpu/include/tfrt/gpu/wrapper/hip_wrapper.h +++ b/backends/gpu/include/tfrt/gpu/wrapper/hip_wrapper.h @@ -29,6 +29,7 @@ namespace gpu { namespace wrapper { raw_ostream& Print(raw_ostream& os, hipError_t error); +raw_ostream& Print(raw_ostream& os, hiprtcResult result); namespace internal { template <> @@ -162,6 +163,8 @@ llvm::Error HipMemsetD32Async(CurrentContext current, Pointer dst, llvm::Expected HipModuleLoadData(CurrentContext current, const void* image); +llvm::Expected HipRTCModuleLoadData(CurrentContext current, + const void* image); llvm::Expected HipModuleLoadDataEx( CurrentContext current, const void* image, llvm::ArrayRef options, llvm::ArrayRef option_values); diff --git a/backends/gpu/lib/kernels/driver_kernels.cc b/backends/gpu/lib/kernels/driver_kernels.cc index 6feb245ceff..8a5d40c204e 100644 --- a/backends/gpu/lib/kernels/driver_kernels.cc +++ b/backends/gpu/lib/kernels/driver_kernels.cc @@ -320,7 +320,6 @@ static Expected GpuModuleLoad(Argument context, MakeStringError("GPU JIT error log: ", error_log)); } #endif - return GpuModule(context.ValueRef(), std::move(*module)); } diff --git a/backends/gpu/lib/wrapper/driver_wrapper.cc b/backends/gpu/lib/wrapper/driver_wrapper.cc index ef4574980da..7e11316d95a 100644 --- a/backends/gpu/lib/wrapper/driver_wrapper.cc +++ b/backends/gpu/lib/wrapper/driver_wrapper.cc @@ -852,7 +852,7 @@ llvm::Expected ModuleLoadData(CurrentContext current, case Platform::CUDA: return CuModuleLoadData(current, image); case Platform::ROCm: - return HipModuleLoadData(current, image); + return HipRTCModuleLoadData(current, image); default: return InvalidPlatform(platform); } diff --git a/backends/gpu/lib/wrapper/hip_stub.cc b/backends/gpu/lib/wrapper/hip_stub.cc index 68a650d3bdf..539738d9339 100644 --- a/backends/gpu/lib/wrapper/hip_stub.cc +++ b/backends/gpu/lib/wrapper/hip_stub.cc @@ -61,3 +61,92 @@ const char *hipGetErrorString(hipError_t hip_error) { if (!func_ptr) return "FAILED_TO_LOAD_FUNCTION_SYMBOL"; return func_ptr(hip_error); } + +const char *hiprtcGetErrorString(hiprtcResult result) { + static auto func_ptr = + GetFunctionPointer("hiprtcGetErrorString", hiprtcGetErrorString); + if (!func_ptr) return "FAILED_TO_LOAD_FUNCTION_SYMBOL"; + return func_ptr(result); +} + +hiprtcResult hiprtcVersion(int* major, int* minor){ + static auto func_ptr = + GetFunctionPointer("hiprtcVersion", hiprtcVersion); + if (!func_ptr) return HIPRTC_ERROR_INTERNAL_ERROR; + return func_ptr(major, minor); +} + +hiprtcResult hiprtcAddNameExpression(hiprtcProgram prog, const char* name_expression){ + static auto func_ptr = + GetFunctionPointer("hiprtcAddNameExpression", hiprtcAddNameExpression); + if (!func_ptr) return HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID; + return func_ptr(prog, name_expression); +} + +hiprtcResult hiprtcCompileProgram( + hiprtcProgram prog, + int numOptions, + const char** options){ + static auto func_ptr = + GetFunctionPointer("hiprtcCompileProgram", hiprtcCompileProgram); + if (!func_ptr) return HIPRTC_ERROR_INTERNAL_ERROR; + return func_ptr(prog, numOptions, options); +} + +hiprtcResult hiprtcCreateProgram( + hiprtcProgram* prog, + const char* src, + const char* name, + int numberHeaders, + char** headers, + const char** includeNames){ + static auto func_ptr = + GetFunctionPointer("hiprtcCreateProgram", hiprtcCreateProgram); + if (!func_ptr) return HIPRTC_ERROR_PROGRAM_CREATION_FAILURE; + return func_ptr(prog, src, name, numberHeaders, headers, includeNames); +} + +hiprtcResult hiprtcDestroyProgram(hiprtcProgram* prog){ + static auto func_ptr = + GetFunctionPointer("hiprtcDestroyProgram", hiprtcDestroyProgram); + if (!func_ptr) return HIPRTC_ERROR_INTERNAL_ERROR; + return func_ptr(prog); +} + +hiprtcResult hiprtcGetLoweredName( + hiprtcProgram prog, + const char* name_expression, + const char** lowered_name){ + static auto func_ptr = + GetFunctionPointer("hiprtcGetLoweredName", hiprtcGetLoweredName); + if (!func_ptr) return HIPRTC_ERROR_INTERNAL_ERROR; + return func_ptr(prog, name_expression, lowered_name); +} + +hiprtcResult hiprtcGetProgramLog(hiprtcProgram prog, char* log){ + static auto func_ptr = + GetFunctionPointer("hiprtcGetProgramLog", hiprtcGetProgramLog); + if (!func_ptr) return HIPRTC_ERROR_INTERNAL_ERROR; + return func_ptr(prog, log); +} + +hiprtcResult hiprtcGetProgramLogSize(hiprtcProgram prog, size_t* logSizeRet){ + static auto func_ptr = + GetFunctionPointer("hiprtcGetProgramLogSize", hiprtcGetProgramLogSize); + if (!func_ptr) return HIPRTC_ERROR_INTERNAL_ERROR; + return func_ptr(prog, logSizeRet); +} + +hiprtcResult hiprtcGetCode(hiprtcProgram prog, char* code){ + static auto func_ptr = + GetFunctionPointer("hiprtcGetCode", hiprtcGetCode); + if (!func_ptr) return HIPRTC_ERROR_INTERNAL_ERROR; + return func_ptr(prog, code); +} + +hiprtcResult hiprtcGetCodeSize(hiprtcProgram prog, size_t* codeSizeRet){ + static auto func_ptr = + GetFunctionPointer("hiprtcGetCodeSize", hiprtcGetCodeSize); + if (!func_ptr) return HIPRTC_ERROR_INTERNAL_ERROR; + return func_ptr(prog, codeSizeRet); +} diff --git a/backends/gpu/lib/wrapper/hip_wrapper.cc b/backends/gpu/lib/wrapper/hip_wrapper.cc index 052c06f1f75..cf37c4ef03e 100644 --- a/backends/gpu/lib/wrapper/hip_wrapper.cc +++ b/backends/gpu/lib/wrapper/hip_wrapper.cc @@ -37,6 +37,12 @@ llvm::raw_ostream& Print(llvm::raw_ostream& os, hipError_t error) { return os; } +llvm::raw_ostream& Print(llvm::raw_ostream& os, hiprtcResult result) { + const char* msg = hiprtcGetErrorString(result); + if (msg != nullptr) os << "hiprtc Error: (" << msg << ")"; + return os; +} + // Convert wrapper types to HIP types. static hipDevice_t ToRocm(Device device) { return device.id(Platform::ROCm); } @@ -540,6 +546,45 @@ llvm::Expected HipModuleLoadData(CurrentContext current, return OwningModule(module); } +llvm::Expected HipRTCModuleLoadData(CurrentContext current, + const void* image) { + CheckHipContext(current); + hiprtcProgram prog; + //auto img = reinterpret_cast(const_cast(image)); + auto kernel = static_cast(image); + std::string kname(kernel); + kname += ".cu"; + RETURN_IF_ERROR(hiprtcCreateProgram(&prog, + kernel, + kname.c_str(), + 0, + nullptr, + nullptr + )); + hiprtcResult compileResult = hiprtcCompileProgram(prog, 0, nullptr); + if (compileResult != HIPRTC_SUCCESS) { + size_t logSize; + hiprtcGetProgramLogSize(prog, &logSize); + if (logSize) { + std::string log(logSize, '\0'); + hiprtcGetProgramLog(prog, &log[0]); + MakeStringError(log.c_str()); + } + } + + size_t code_size; + RETURN_IF_ERROR(hiprtcGetCodeSize(prog, &code_size)); + std::vector code(code_size); + RETURN_IF_ERROR(hiprtcGetCode(prog, code.data())); + RETURN_IF_ERROR(hiprtcDestroyProgram(&prog)); + + hipModule_t module; + RETURN_IF_ERROR(hipModuleLoadData(&module, code.data())); + + NotifyResourceCreated(ResourceType::kModule, module); + return OwningModule(module); +} + llvm::Expected HipModuleLoadDataEx( CurrentContext current, const void* image, llvm::ArrayRef options, llvm::ArrayRef option_values) { diff --git a/backends/gpu/mlir_tests/rocm/module.mlir b/backends/gpu/mlir_tests/rocm/module.mlir new file mode 100644 index 00000000000..df2d3be1332 --- /dev/null +++ b/backends/gpu/mlir_tests/rocm/module.mlir @@ -0,0 +1,94 @@ +// Copyright 2020 The TensorFlow Runtime Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// RUN: bef_executor_lite %s.bef | FileCheck %s + +// CHECK-LABEL: --- Running 'function_test' +func @function_test() { + %ordinal = tfrt.constant.i32 0 + %device = tfrt_gpu.device.get ROCm, %ordinal + %context = tfrt_gpu.context.create %device + + // PTX for empty kernel. + // Typically module loading should be done at initialization time. + %module = tfrt_gpu.module.load %context { + data = "extern \"C\" __global__ void Kernel() { return; }\00" + } + + %func = tfrt_gpu.module.get_function %module { name = "Kernel" } + + tfrt.return +} + +func @global_test() { + %ordinal = tfrt.constant.i32 0 + %device = tfrt_gpu.device.get ROCm, %ordinal + %context = tfrt_gpu.context.create %device + + // PTX for a module with a global symbol. + %module = tfrt_gpu.module.load %context { + data = "__device__ unsigned int Global[128];\00" + } + + %global = tfrt_gpu.module.get_global %module { name = "Global" } + + tfrt.return +} + +// CHECK-LABEL: --- Running 'module_bad_data_test' +func @module_bad_data_test() { + %ch2 = tfrt.new.chain + %ordinal = tfrt.constant.i32 0 + %device = tfrt_gpu.device.get ROCm, %ordinal + %context = tfrt_gpu.context.create %device + + // expected-error @+1 {{hipErrorInvalidValue}} + %func = tfrt_gpu.module.load %context { + data = "invalid image\00" + } + + tfrt.return +} + +// CHECK-LABEL: --- Running 'function_bad_name_test' +func @function_bad_name_test() { + %ch2 = tfrt.new.chain + %ordinal = tfrt.constant.i32 0 + %device = tfrt_gpu.device.get ROCm, %ordinal + %context = tfrt_gpu.context.create %device + + %module = tfrt_gpu.module.load %context { + data = "extern \"C\" __global__ void Kernel() { return; }\00" + } + + // expected-error @+1 {{hipErrorNotFound}} + %func = tfrt_gpu.module.get_function %module { name = "Foo\00" } + + tfrt.return +} + +// CHECK-LABEL: --- Running 'module_not_null_terminated_test' +func @module_not_null_terminated_test() { + %ch2 = tfrt.new.chain + %ordinal = tfrt.constant.i32 0 + %device = tfrt_gpu.device.get ROCm, %ordinal + %context = tfrt_gpu.context.create %device + + // expected-error @+1 {{data attribute must be null-terminated}} + %module = tfrt_gpu.module.load %context { + data = "not null-terminated" + } + + tfrt.return +} diff --git a/backends/gpu/tools/stub_codegen/generate.sh b/backends/gpu/tools/stub_codegen/generate.sh index f9430698bbd..e710ef9e6e6 100755 --- a/backends/gpu/tools/stub_codegen/generate.sh +++ b/backends/gpu/tools/stub_codegen/generate.sh @@ -21,7 +21,7 @@ set -eux # Build the tools and generate the HIP header. -bazel build --nocheck_visibility \ +bazel build --nocheck_visibility --config=gcc\ //backends/gpu/tools/stub_codegen:header_codegen \ //backends/gpu/tools/stub_codegen:impl_codegen @@ -34,3 +34,10 @@ for API in "hip" "rocblas" "rocsolver" "miopen" "hipfft"; do ./bazel-bin/backends/gpu/tools/stub_codegen/impl_codegen \ $(dirname $0)/$API.json | clang-format > $(printf $SRC_PATH $API) done + +# Hiprtc is currently rolled up in hip shared library. +# It is subject to change in future releases. +./bazel-bin/backends/gpu/tools/stub_codegen/header_codegen \ + $(dirname $0)/hiprtc.json | clang-format >> third_party/hip/hip_stub.h.inc +./bazel-bin/backends/gpu/tools/stub_codegen/impl_codegen \ + $(dirname $0)/hiprtc.json | clang-format >> third_party/hip/hip_stub.cc.inc diff --git a/backends/gpu/tools/stub_codegen/hipfft.json b/backends/gpu/tools/stub_codegen/hipfft.json index 36ab5cd5875..41b6c054af9 100644 --- a/backends/gpu/tools/stub_codegen/hipfft.json +++ b/backends/gpu/tools/stub_codegen/hipfft.json @@ -24,5 +24,5 @@ "hipfftSetStream", "hipfftSetWorkArea", "hipfftSetAutoAllocation" - ], + ] } diff --git a/backends/gpu/tools/stub_codegen/hiprtc.json b/backends/gpu/tools/stub_codegen/hiprtc.json new file mode 100644 index 00000000000..3470f990588 --- /dev/null +++ b/backends/gpu/tools/stub_codegen/hiprtc.json @@ -0,0 +1,16 @@ +{ + "header":"/opt/rocm-5.0.0/include/hip/hiprtc.h", + "extra_args":[ + "-D__HIP_PLATFORM_AMD__", + "-I.", + "-I/opt/rocm-5.0.0/include/", + "-Ithird_party/llvm/llvm-project/clang/lib/Headers", + "-Ibazel-genfiles", + "-ferror-limit=0" + ], + "enums":[ + "hiprtcResult" + ], + "functions":[ + ] +} diff --git a/backends/gpu/tools/stub_codegen/miopen.json b/backends/gpu/tools/stub_codegen/miopen.json index 956189c9272..63324e14f40 100644 --- a/backends/gpu/tools/stub_codegen/miopen.json +++ b/backends/gpu/tools/stub_codegen/miopen.json @@ -191,5 +191,5 @@ "miopenSetDropoutDescriptor", "miopenDropoutForward", "miopenDropoutBackward" - ], + ] } diff --git a/backends/gpu/tools/stub_codegen/rocblas.json b/backends/gpu/tools/stub_codegen/rocblas.json index 0ebe21fd122..828f938d9b9 100644 --- a/backends/gpu/tools/stub_codegen/rocblas.json +++ b/backends/gpu/tools/stub_codegen/rocblas.json @@ -29,5 +29,5 @@ "rocblas_dtrsm_batched", "rocblas_ctrsm_batched", "rocblas_ztrsm_batched" - ], + ] } diff --git a/backends/gpu/tools/stub_codegen/rocsolver.json b/backends/gpu/tools/stub_codegen/rocsolver.json index fe4fc6564a2..392a10acd9b 100644 --- a/backends/gpu/tools/stub_codegen/rocsolver.json +++ b/backends/gpu/tools/stub_codegen/rocsolver.json @@ -9,5 +9,5 @@ "rocsolver_dpotrf", "rocsolver_cpotrf", "rocsolver_zpotrf" - ], + ] } diff --git a/third_party/hip/hip_stub.h.inc b/third_party/hip/hip_stub.h.inc index 7892e2e7995..119e3dd319b 100644 --- a/third_party/hip/hip_stub.h.inc +++ b/third_party/hip/hip_stub.h.inc @@ -462,3 +462,17 @@ enum hipDataType { HIP_C_32F = 4, HIP_C_64F = 5, }; +enum hiprtcResult { + HIPRTC_SUCCESS = 0, + HIPRTC_ERROR_OUT_OF_MEMORY = 1, + HIPRTC_ERROR_PROGRAM_CREATION_FAILURE = 2, + HIPRTC_ERROR_INVALID_INPUT = 3, + HIPRTC_ERROR_INVALID_PROGRAM = 4, + HIPRTC_ERROR_INVALID_OPTION = 5, + HIPRTC_ERROR_COMPILATION = 6, + HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7, + HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8, + HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9, + HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10, + HIPRTC_ERROR_INTERNAL_ERROR = 11, +}; diff --git a/third_party/hip/hipfft_stub.cc.inc b/third_party/hip/hipfft_stub.cc.inc index ffe4cebd3eb..f0296ba7cd8 100644 --- a/third_party/hip/hipfft_stub.cc.inc +++ b/third_party/hip/hipfft_stub.cc.inc @@ -1,3 +1,8 @@ +HIPFFT_EXPORT hipfftResult hipfftCreate(hipfftHandle* plan) { + return DynamicCall("hipfftCreate", + plan); +} + HIPFFT_EXPORT hipfftResult hipfftMakePlanMany64( hipfftHandle plan, int rank, long long int* n, long long int* inembed, long long int istride, long long int idist, long long int* onembed, @@ -13,17 +18,19 @@ HIPFFT_EXPORT hipfftResult hipfftGetSize(hipfftHandle plan, size_t* workSize) { plan, workSize); } +HIPFFT_EXPORT hipfftResult hipfftSetAutoAllocation(hipfftHandle plan, + int autoAllocate) { + return DynamicCall("hipfftSetAutoAllocation", plan, + autoAllocate); +} + HIPFFT_EXPORT hipfftResult hipfftSetWorkArea(hipfftHandle plan, void* workArea) { return DynamicCall( "hipfftSetWorkArea", plan, workArea); } -HIPFFT_EXPORT hipfftResult hipfftSetAutoAllocation(hipfftHandle plan, int enable) { - return DynamicCall( - "hipfftSetAutoAllocation", plan, enable); -} - HIPFFT_EXPORT hipfftResult hipfftExecC2C(hipfftHandle plan, hipfftComplex* idata, hipfftComplex* odata, int direction) { @@ -72,11 +79,6 @@ HIPFFT_EXPORT hipfftResult hipfftSetStream(hipfftHandle plan, "hipfftSetStream", plan, stream); } -HIPFFT_EXPORT hipfftResult hipfftCreate(hipfftHandle* plan) { - return DynamicCall("hipfftCreate", - plan); -} - HIPFFT_EXPORT hipfftResult hipfftDestroy(hipfftHandle plan) { return DynamicCall("hipfftDestroy", plan); diff --git a/third_party/hip/hipfft_stub.h.inc b/third_party/hip/hipfft_stub.h.inc index fee4121a479..cc70b112986 100644 --- a/third_party/hip/hipfft_stub.h.inc +++ b/third_party/hip/hipfft_stub.h.inc @@ -32,17 +32,20 @@ typedef enum hipfftLibraryPropertyType_t { HIPFFT_PATCH_LEVEL, } hipfftLibraryPropertyType; +HIPFFT_EXPORT hipfftResult hipfftCreate(hipfftHandle* plan); + HIPFFT_EXPORT hipfftResult hipfftMakePlanMany64( hipfftHandle plan, int rank, long long int* n, long long int* inembed, long long int istride, long long int idist, long long int* onembed, long long int ostride, long long int odist, hipfftType type, - long long int batch, size_t* work_size); + long long int batch, size_t* workSize); HIPFFT_EXPORT hipfftResult hipfftGetSize(hipfftHandle plan, size_t* workSize); -HIPFFT_EXPORT hipfftResult hipfftSetWorkArea(hipfftHandle plan, void* workArea); +HIPFFT_EXPORT hipfftResult hipfftSetAutoAllocation(hipfftHandle plan, + int autoAllocate); -HIPFFT_EXPORT hipfftResult hipfftSetAutoAllocation(hipfftHandle plan, int enable); +HIPFFT_EXPORT hipfftResult hipfftSetWorkArea(hipfftHandle plan, void* workArea); HIPFFT_EXPORT hipfftResult hipfftExecC2C(hipfftHandle plan, hipfftComplex* idata, @@ -71,8 +74,6 @@ HIPFFT_EXPORT hipfftResult hipfftExecZ2D(hipfftHandle plan, HIPFFT_EXPORT hipfftResult hipfftSetStream(hipfftHandle plan, hipStream_t stream); -HIPFFT_EXPORT hipfftResult hipfftCreate(hipfftHandle* plan); - HIPFFT_EXPORT hipfftResult hipfftDestroy(hipfftHandle plan); HIPFFT_EXPORT hipfftResult hipfftGetProperty(hipfftLibraryPropertyType type,