Commit 0708d7a

remove macro in py_client_gpu (#417)
1 parent: 73aab13

3 files changed: +49, -95 lines

third_party/openxla.patch

Lines changed: 31 additions & 95 deletions
@@ -428,7 +428,7 @@ index e23dcc3a4c..aaaf22ed81 100644
 system_link_files = {
 "//third_party/systemlibs:BUILD": "bazel/BUILD",
 diff --git a/xla/backends/profiler/plugin/BUILD b/xla/backends/profiler/plugin/BUILD
-index 169a4eaa4e..1b8c0bae04 100644
+index 169a4eaa4e..161e4e0452 100644
 --- a/xla/backends/profiler/plugin/BUILD
 +++ b/xla/backends/profiler/plugin/BUILD
 @@ -62,6 +62,10 @@ cc_library(
@@ -663,7 +663,7 @@ index e2c82bad04..8401ec77d8 100644
 )
 
 diff --git a/xla/python/py_client.cc b/xla/python/py_client.cc
-index 0afd053313..f51f71b4f4 100644
+index 0afd053313..f2116e7f5a 100644
 --- a/xla/python/py_client.cc
 +++ b/xla/python/py_client.cc
 @@ -91,9 +91,9 @@ limitations under the License.
@@ -678,8 +678,17 @@ index 0afd053313..f51f71b4f4 100644
 
 namespace xla {
 
+@@ -670,7 +670,7 @@ PyClient::GetEmitPythonCallbackDescriptor(nb::callable callable,
+ XLA_CPU_REGISTER_CUSTOM_CALL_TARGET_WITH_SYM("xla_python_cpu_callback",
+                                              &XlaPythonCpuCallback);
+
+-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
++#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM || TENSORFLOW_USE_SYCL
+ XLA_REGISTER_CUSTOM_CALL_TARGET_WITH_SYM(
+     "xla_python_gpu_callback", &XlaPythonGpuCallback,
+     absl::AsciiStrToUpper(PlatformUtil::CanonicalPlatformName("gpu").value()));
 diff --git a/xla/python/py_client_gpu.cc b/xla/python/py_client_gpu.cc
-index 100d9fd599..91df06ad4e 100644
+index 100d9fd599..642828a9ce 100644
 --- a/xla/python/py_client_gpu.cc
 +++ b/xla/python/py_client_gpu.cc
 @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -690,105 +699,43 @@ index 100d9fd599..91df06ad4e 100644
 
 #include <vector>
 
-@@ -20,7 +21,7 @@ limitations under the License.
+@@ -20,6 +21,8 @@ limitations under the License.
 #include "tsl/platform/errors.h"
 #if TENSORFLOW_USE_ROCM
 #include "rocm/include/hip/hip_runtime.h"
--#else
-+#elif GOOGLE_CUDA
++#elif TENSORFLOW_USE_SYCL
++#include "xla/stream_executor/sycl/sycl_gpu_runtime.h"
+ #else
 #include "third_party/gpus/cuda/include/cuda.h"
 #include "third_party/gpus/cuda/include/cuda_runtime_api.h"
- #endif
-@@ -38,13 +39,15 @@ limitations under the License.
+@@ -38,6 +41,13 @@ limitations under the License.
 #define gpuStreamSynchronize hipStreamSynchronize
 #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
 #define gpuMemcpyHostToDevice hipMemcpyHostToDevice
--#else
-+#elif GOOGLE_CUDA
++#elif TENSORFLOW_USE_SYCL
++#define gpuSuccess SYCL_SUCCESS
++#define gpuStreamHandle ::sycl::queue*
++#define gpuMemcpyAsync SYCLMemcpyAsync
++#define gpuStreamSynchronize SYCLStreamSynchronize
++#define gpuMemcpyDeviceToHost SYCLMemcpyDtoHAsync
++#define gpuMemcpyHostToDevice SYCLMemcpyHtoDAsync
+ #else
 #define gpuSuccess cudaSuccess
 #define gpuStreamHandle CUstream
- #define gpuMemcpyAsync cudaMemcpyAsync
- #define gpuStreamSynchronize cudaStreamSynchronize
- #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
- #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
-+#else
-+#define gpuStreamHandle ::sycl::queue*
- #endif
-
- namespace nb = nanobind;
-@@ -74,13 +77,20 @@ void XlaPythonGpuCallback(gpuStreamHandle stream, void** buffers,
- }
- void* buf = new char[arg.size_in_bytes];
- host_input_buffers[i] = buf;
-+#if TENSORFLOW_USE_SYCL
-+ auto event = stream->memcpy(buf, (const void*)(buffers[i]), arg.size_in_bytes);
-+ event.wait();
-+#else
- // TODO(b/238441608): Use pinned memory here to speed up the transfer.
- auto gpu_res = gpuMemcpyAsync(buf, buffers[i], arg.size_in_bytes,
- gpuMemcpyDeviceToHost, stream);
- CHECK_EQ(gpu_res, gpuSuccess) << "Failed to gpuMemcpyAsync";
-+#endif
- }
-+#ifndef TENSORFLOW_USE_SYCL
- CHECK_EQ(gpuStreamSynchronize(stream), gpuSuccess)
- << "Failed to gpuStreamSynchronize";
-+#endif
- nb::gil_scoped_acquire gil;
- nb::tuple host_input_arrays = nb::steal<nb::tuple>(PyTuple_New(arity));
- for (size_t i = 0; i < arity; ++i) {
-@@ -120,10 +130,15 @@ void XlaPythonGpuCallback(gpuStreamHandle stream, void** buffers,
- absl::Span<int64_t const> strides(
- reinterpret_cast<const int64_t*>(array.strides()), array.ndim());
- if (strides == result.expected_strides) {
-+#ifdef TENSORFLOW_USE_SYCL
-+ auto event = stream->memcpy(buffers[arity + i], array.data(), result.size_in_bytes);
-+ event.wait();
-+#else
- auto gpu_res =
- gpuMemcpyAsync(buffers[arity + i], array.data(), result.size_in_bytes,
- gpuMemcpyHostToDevice, stream);
- CHECK_EQ(gpu_res, gpuSuccess) << "Failed to gpuMemcpyAsync";
-+#endif
- } else {
- void* temp = new char[result.size_in_bytes];
- temp_buffers.push_back(temp);
-@@ -138,15 +153,22 @@ void XlaPythonGpuCallback(gpuStreamHandle stream, void** buffers,
- throw xla::XlaRuntimeError(plan.status().ToString());
- }
- plan.value()->Execute(array.data(), temp);
-+#ifdef TENSORFLOW_USE_SYCL
-+ auto event = stream->memcpy(buffers[arity + i], temp, result.size_in_bytes);
-+ event.wait();
-+#else
- auto gpu_res =
- gpuMemcpyAsync(buffers[arity + i], temp, result.size_in_bytes,
- gpuMemcpyHostToDevice, stream);
- CHECK_EQ(gpu_res, gpuSuccess) << "Failed to gpuMemcpyAsync";
-+#endif
- }
- }
- nb::gil_scoped_release release;
-+#ifndef TENSORFLOW_USE_SYCL
- CHECK_EQ(gpuStreamSynchronize(stream), gpuSuccess)
- << "Failed to gpuStreamSynchronize";
-+#endif
- for (int i = 0; i < temp_buffers.size(); ++i) {
- delete[] static_cast<char*>(temp_buffers[i]);
- }
 diff --git a/xla/python/py_client_gpu.h b/xla/python/py_client_gpu.h
-index d7675e1b6a..571a134431 100644
+index d7675e1b6a..17da528bec 100644
 --- a/xla/python/py_client_gpu.h
 +++ b/xla/python/py_client_gpu.h
-@@ -18,17 +18,28 @@ limitations under the License.
+@@ -18,6 +18,8 @@ limitations under the License.
 
 #if TENSORFLOW_USE_ROCM
 #include "rocm/include/hip/hip_runtime.h"
--#else
-+#elif GOOGLE_CUDA
++#elif TENSORFLOW_USE_SYCL
++#include "xla/stream_executor/sycl/sycl_gpu_runtime.h"
+ #else
 #include "third_party/gpus/cuda/include/cuda.h"
 #endif
- #include "xla/service/custom_call_status.h"
+@@ -25,8 +27,10 @@ limitations under the License.
 
 #if TENSORFLOW_USE_ROCM
 #define gpuStreamHandle hipStream_t
@@ -799,18 +746,7 @@ index d7675e1b6a..571a134431 100644
 +#define gpuStreamHandle ::sycl::queue*
 #endif
 
-+#if TENSORFLOW_USE_SYCL
-+#if __has_include(<sycl/sycl.hpp>)
-+#include <sycl/sycl.hpp>
-+#elif __has_include(<CL/sycl.hpp>)
-+#include <CL/sycl.hpp>
-+#else
-+#error "Unsupported compiler"
-+#endif
-+#endif
 namespace xla {
-
- void XlaPythonGpuCallback(gpuStreamHandle stream, void** buffers,
 diff --git a/xla/service/BUILD b/xla/service/BUILD
 index bcedb98906..952e4c5f6f 100644
 --- a/xla/service/BUILD
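Aside: the reworked patch drops the per-call-site `#ifdef TENSORFLOW_USE_SYCL` blocks from `XlaPythonGpuCallback` and instead gives SYCL its own `gpu*` alias table, so one callback body compiles against CUDA, ROCm, or SYCL. A minimal, self-contained sketch of that alias pattern follows; the stub runtime and names below are illustrative, not the real XLA code:

```cpp
// Sketch of the alias-table pattern: each backend maps the generic gpu*
// names onto its own runtime exactly once, so the callback body carries
// no per-call-site #ifdef blocks. The "SYCL" runtime here is a stub so
// the file compiles standalone; real builds alias the actual runtimes.
#include <cstdio>
#include <cstring>

#define TENSORFLOW_USE_SYCL 1  // pretend this is a SYCL build

using StubStream = int;  // stand-in for ::sycl::queue*

static int StubMemcpy(void* dst, const void* src, std::size_t n, StubStream*) {
  std::memcpy(dst, src, n);  // host-side stand-in for a device copy
  return 0;
}

#if TENSORFLOW_USE_SYCL
#define gpuSuccess 0
#define gpuStreamHandle StubStream*
#define gpuMemcpyAsync StubMemcpy
#else  // the CUDA branch in the real patch
#define gpuSuccess cudaSuccess
#define gpuStreamHandle CUstream
#define gpuMemcpyAsync cudaMemcpyAsync
#endif

int main() {
  char src[] = "tensor", dst[8] = {};
  StubStream backing = 0;
  gpuStreamHandle stream = &backing;
  // Written once against the gpu* names; no backend #ifdef needed here.
  if (gpuMemcpyAsync(dst, src, sizeof(src), stream) == gpuSuccess)
    std::printf("%s\n", dst);
  return 0;
}
```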

xla/stream_executor/sycl/sycl_gpu_runtime.cc

Lines changed: 12 additions & 0 deletions
@@ -470,6 +470,18 @@ SYCLError_t SYCLMemsetD32Async(void* dstDevice, unsigned int ui, size_t N,
   return SYCL_SUCCESS;
 }
 
+SYCLError_t SYCLMemcpyAsync(void* dst, const void* src, size_t ByteCount,
+                            SYCLError_t (*func)(void*, const void*, size_t, sycl::queue*),
+                            sycl::queue* stream){
+  return (*func)(dst, src, ByteCount, stream);
+}
+
+SYCLError_t SYCLStreamSynchronize(sycl::queue* stream){
+  stream->wait();
+  return SYCL_SUCCESS;
+}
+
+
 void* SYCLMalloc(sycl::device* device, size_t ByteCount) {
   sycl::queue* stream;
   SYCLStreamPool::getDefaultStream(device, &stream);
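Note the signature: the function-pointer parameter sits in the slot where CUDA passes a `cudaMemcpyKind`, so the shared call site `gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToHost, stream)` expands correctly once `gpuMemcpyDeviceToHost` is defined as `SYCLMemcpyDtoHAsync`. A standalone sketch of that trick, with stub types standing in for the real runtime:

```cpp
// Self-contained illustration of the SYCLMemcpyAsync signature trick:
// the fourth argument, a direction enum under CUDA, is a function
// pointer here, so shared call sites expand without edits.
// All names below are stand-ins, not the real XLA/SYCL runtime.
#include <cstdio>
#include <cstring>

using Stream = int;  // stand-in for sycl::queue
using Err = int;     // stand-in for SYCLError_t

// Stand-in for SYCLMemcpyDtoHAsync: performs the actual copy.
static Err StubDtoH(void* dst, const void* src, std::size_t n, Stream*) {
  std::memcpy(dst, src, n);  // a real build would call stream->memcpy()
  return 0;
}

// Mirrors the shape of SYCLMemcpyAsync from this commit: forward to
// whichever copy routine arrives in the "kind" slot.
static Err StubMemcpyAsync(void* dst, const void* src, std::size_t n,
                           Err (*func)(void*, const void*, std::size_t, Stream*),
                           Stream* stream) {
  return (*func)(dst, src, n, stream);
}

#define gpuMemcpyAsync StubMemcpyAsync
#define gpuMemcpyDeviceToHost StubDtoH

int main() {
  char src[] = "xla", dst[4] = {};
  Stream s = 0;
  // Identical shape to the CUDA call site in py_client_gpu.cc.
  gpuMemcpyAsync(dst, src, sizeof(src), gpuMemcpyDeviceToHost, &s);
  std::printf("%s\n", dst);
  return 0;
}
```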

xla/stream_executor/sycl/sycl_gpu_runtime.h

Lines changed: 6 additions & 0 deletions
@@ -99,6 +99,12 @@ void* SYCLMallocShared(sycl::device* device, size_t ByteCount);
 
 void SYCLFree(sycl::device* device, void* ptr);
 
+SYCLError_t SYCLMemcpyAsync(void* dst, const void* src, size_t ByteCount,
+                            SYCLError_t (*func)(void*, const void*, size_t, sycl::queue*),
+                            sycl::queue* stream);
+
+SYCLError_t SYCLStreamSynchronize(sycl::queue* stream);
+
 sycl::event SYCLGetEventFromStream(sycl::queue* stream);
 
 void SYCLStreamDependOnEvents(sycl::queue* stream,
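For reference, these helpers wrap standard SYCL queue operations: `queue::memcpy` returns an event that can be waited on, and `queue::wait()` blocks on all previously submitted work, which is what `SYCLStreamSynchronize` does. A minimal sketch assuming a SYCL 2020 toolchain such as DPC++:

```cpp
// Sketch of the underlying SYCL semantics the new helpers wrap:
// queue::memcpy returns an event; wait() blocks until the copy lands,
// and queue::wait() drains everything submitted to the queue.
// Requires a SYCL compiler (e.g. DPC++); illustrative only.
#include <sycl/sycl.hpp>
#include <cstdio>

int main() {
  sycl::queue q;  // default-selected device
  int host_src[4] = {1, 2, 3, 4}, host_dst[4] = {};
  int* dev = sycl::malloc_device<int>(4, q);

  q.memcpy(dev, host_src, sizeof(host_src)).wait();  // HtoD, blocking
  q.memcpy(host_dst, dev, sizeof(host_dst));         // DtoH, asynchronous
  q.wait();  // what SYCLStreamSynchronize does for the whole queue

  std::printf("%d %d %d %d\n", host_dst[0], host_dst[1], host_dst[2],
              host_dst[3]);
  sycl::free(dev, q);
  return 0;
}
```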
