EnzymeAD
diff --git a/‎.github/workflows/CI.yml
Lines changed: 13 additions & 2 deletions b/‎.github/workflows/CI.yml
Lines changed: 13 additions & 2 deletions
diff --git a/‎Project.toml
Lines changed: 12 additions & 4 deletions b/‎Project.toml
Lines changed: 12 additions & 4 deletions
diff --git a/‎deps/ReactantExtra/.bazelrc
Lines changed: 5 additions & 0 deletions b/‎deps/ReactantExtra/.bazelrc
Lines changed: 5 additions & 0 deletions
diff --git a/‎deps/ReactantExtra/BUILD
Lines changed: 34 additions & 23 deletions b/‎deps/ReactantExtra/BUILD
Lines changed: 34 additions & 23 deletions
diff --git a/‎deps/ReactantExtra/WORKSPACE
Lines changed: 43 additions & 33 deletions b/‎deps/ReactantExtra/WORKSPACE
Lines changed: 43 additions & 33 deletions
@@ -33,6 +33,8 @@ jobs:
     timeout-minutes: 90
     name: Julia ${{ matrix.version }} - ${{ matrix.test_group }} - ${{ matrix.os }} - ${{ matrix.runtime }} - assertions=${{ matrix.assertions }} - ${{ github.event_name }}
     runs-on: ${{ matrix.os }}
+    container:
+      image: ${{ contains(matrix.os, 'linux') && 'ghcr.io/enzymead/reactant-docker-images:main' || '' }}
     strategy:
       fail-fast: false
       matrix:
@@ -59,6 +61,11 @@ jobs:
         assertions:
           - false
         include:
+          - os: linux-x86-ct6e-180-4tpu
+            version: "1.11"
+            assertions: false
+            test_group: core
+            runtime: "IFRT"
           - os: ubuntu-24.04
             version: "1.10"
             assertions: true
@@ -86,9 +93,13 @@ jobs:
           #   libReactant: packaged
           #   version: '1.10'
           #   test_group: integration
-    env:
-      TMPDIR: ${{ github.workspace }}/tmp
     steps:
+      - name: Set TMPDIR
+        # Derive TMPDIR from the `${GITHUB_WORKSPACE}` environment variable rather
+        # than the `github.workspace` context: the two resolve to different paths
+        # inside container jobs (https://github.com/actions/runner/issues/2058).
+        run: |
+          echo "TMPDIR=${GITHUB_WORKSPACE}/tmp" >> "${GITHUB_ENV}"
       - uses: actions/checkout@v4
       - name: Create TMPDIR
         run: |
 
@@ -1,7 +1,7 @@
 name = "Reactant"
 uuid = "3c362404-f566-11ee-1572-e11a4b42c853"
 authors = ["William Moses <[email protected]>", "Valentin Churavy <[email protected]>", "Sergio Sánchez Ramírez <[email protected]>", "Paul Berg <[email protected]>", "Avik Pal <[email protected]>", "Mosè Giordano <[email protected]>"]
-version = "0.2.146"
+version = "0.2.149"
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
@@ -25,11 +25,14 @@ Reactant_jll = "0192cb87-2b54-54ad-80e0-3be72ad8a3c0"
 ScopedValues = "7e506255-f358-4e82-b7e4-beb19740aa63"
 Scratch = "6c6a2e73-6563-6170-7368-637461726353"
 Sockets = "6462fe0b-24de-5631-8697-dd941f90decc"
+unzip_jll = "88f77b66-78eb-5ed0-bc16-ebba0796830d"
 
 [weakdeps]
 AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"
 ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+DLFP8Types = "f4c16678-4a16-415b-82ef-ed337c5d6c7c"
+Float8s = "81dfefd7-55b0-40c6-a251-db853704e186"
 GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
@@ -43,13 +46,15 @@ SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 YaoBlocks = "418bc28f-b43b-5e0b-a6e7-61bbc1a2c1df"
 
-[sources.ReactantCore]
-path = "lib/ReactantCore"
+[sources]
+ReactantCore = {path = "lib/ReactantCore"}
 
 [extensions]
 ReactantAbstractFFTsExt = "AbstractFFTs"
 ReactantArrayInterfaceExt = "ArrayInterface"
 ReactantCUDAExt = ["CUDA", "GPUCompiler", "KernelAbstractions", "LLVM"]
+ReactantDLFP8TypesExt = "DLFP8Types"
+ReactantFloat8sExt = "Float8s"
 ReactantKernelAbstractionsExt = "KernelAbstractions"
 ReactantMPIExt = "MPI"
 ReactantNNlibExt = ["NNlib", "Statistics"]
@@ -67,10 +72,12 @@ Adapt = "4.1"
 ArrayInterface = "7.17.1"
 CEnum = "0.5"
 CUDA = "5.6"
+DLFP8Types = "0.1"
 Downloads = "1.6"
 EnumX = "1"
 Enzyme = "0.13.49"
 EnzymeCore = "0.8.11"
+Float8s = "0.1"
 Functors = "0.5"
 GPUArraysCore = "0.2"
 GPUCompiler = "1.3"
@@ -90,12 +97,13 @@ PythonCall = "0.9.25"
 Random = "1.10"
 Random123 = "1.7"
 ReactantCore = "0.1.15"
-Reactant_jll = "0.0.219"
+Reactant_jll = "0.0.224"
 ScopedValues = "1.3.0"
 Scratch = "1.2"
 Sockets = "1.10"
 SpecialFunctions = "2.4"
 Statistics = "1.10"
+unzip_jll = "6"
 YaoBlocks = "0.13, 0.14"
 julia = "1.10"
 
 
@@ -2,12 +2,14 @@ build --announce_rc
 
 # TODO: Migrate for https://github.com/bazelbuild/bazel/issues/7260
 common --noincompatible_enable_cc_toolchain_resolution
+common --repo_env USE_HERMETIC_CC_TOOLCHAIN=0
 common --experimental_repo_remote_exec
 common --cxxopt=-std=c++17 --host_cxxopt=-std=c++17
 common --cxxopt=-w --host_cxxopt=-w
 common --define=grpc_no_ares=true
 common --noenable_bzlmod
 
+
 build --repo_env=USE_PYWRAP_RULES=True
 build --copt=-DGRPC_BAZEL_BUILD
 build --host_copt=-DGRPC_BAZEL_BUILD
@@ -27,6 +29,7 @@ build:cuda --repo_env TF_NVCC_CLANG=1
 build:cuda --repo_env TF_NCCL_USE_STUB=1
 build:cuda --repo_env=HERMETIC_CUDA_VERSION="12.8.1"
 build:cuda --repo_env=HERMETIC_CUDNN_VERSION="9.8.0"
+build:cuda --repo_env=HERMETIC_NVSHMEM_VERSION="3.2.5"
 # "sm" means we emit only cubin, which is forward compatible within a GPU generation.
 # "compute" means we emit both cubin and PTX, which is larger but also forward compatible to future GPU generations.
 build:cuda --repo_env HERMETIC_CUDA_COMPUTE_CAPABILITIES="sm_60,sm_70,sm_80,compute_90"
@@ -35,6 +38,8 @@ build:cuda --@local_config_cuda//:enable_cuda
 # Default hermetic CUDA and CUDNN versions.
 build:cuda --@local_config_cuda//cuda:include_cuda_libs=true
 build:cuda --@local_config_cuda//:cuda_compiler=nvcc
+# build:cuda --@local_config_nvshmem//:override_include_nvshmem_libs=true
+# build:cuda --@local_config_nvshmem//cuda:include_nvshmem_libs=true
 
 build:rocm --repo_env TF_NEED_ROCM=1
 build:rocm --define=using_rocm=true
 
@@ -1,5 +1,6 @@
 load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library", "td_library")
 load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm")
+load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
 load("@xla//tools/toolchains/cross_compile/cc:cc_toolchain_config.bzl", "cc_toolchain_config")
 
 # load("//toolchain:yggdrasil.bzl", "ygg_cc_toolchain")
@@ -752,6 +753,22 @@ platform(
     ],
 )
 
+# Target platform describing 64-bit x86 Windows: one OS constraint plus one
+# CPU constraint. The OS constraint must be `windows` (not `linux`) so that
+# anything keyed on this platform's constraints sees a Windows target.
+platform(
+    name = "win_x86_64",
+    constraint_values = [
+        "@platforms//os:windows",
+        "@platforms//cpu:x86_64",
+    ],
+)
+
+# Target platform describing 64-bit ARM Windows: one OS constraint plus one
+# CPU constraint. The OS constraint must be `windows` (not `linux`) so that
+# anything keyed on this platform's constraints sees a Windows target.
+platform(
+    name = "win_aarch64",
+    constraint_values = [
+        "@platforms//os:windows",
+        "@platforms//cpu:aarch64",
+    ],
+)
+
 cc_library(
     name = "ReactantExtraLib",
     srcs = glob(
@@ -777,12 +794,7 @@ cc_library(
         "-Werror=return-type",
         "-Werror=unused-result",
         "-Wno-error=stringop-truncation",
-    ] + select({
-        "@xla//xla/tsl:is_cuda_enabled_and_oss": [
-            "-DREACTANT_CUDA=1",
-        ],
-        "//conditions:default": [],
-    }),
+    ] + if_cuda(["-DREACTANT_CUDA=1"]),
     linkopts = select({
         "//conditions:default": [],
         "@bazel_tools//src/conditions:darwin": [
@@ -795,6 +807,9 @@ cc_library(
             "-Wl,-exported_symbol,_SetModuleLogLevel",
             "-Wl,-exported_symbol,_GetDefaultTargetTriple",
             "-Wl,-exported_symbol,_enzymeActivityAttrGet",
+            "-Wl,-exported_symbol,_UninitPJRTBuffer",
+            "-Wl,-exported_symbol,_CopyToBuffer",
+            "-Wl,-exported_symbol,_CopyFromBuffer",
             "-Wl,-exported_symbol,_MakeCPUClient",
             "-Wl,-exported_symbol,_MakeGPUClient",
             "-Wl,-exported_symbol,_MakeTPUClient",
@@ -1029,23 +1044,19 @@ cc_library(
         "@xla//xla/tsl/platform:errors",
         "@xla//xla/service:hlo_proto_cc_impl",
         "@com_google_absl//absl/status:statusor",
-    ] + select({
-        "@xla//xla/tsl:is_cuda_enabled_and_oss": [
-            "@jax//jaxlib/cuda:cuda_gpu_kernels",
-            "@xla//xla/backends/profiler:profiler_backends",
-            "@xla//xla/backends/profiler/gpu:device_tracer",
-            "@xla//xla/pjrt/c:pjrt_c_api_gpu_internal",
-            "@xla//xla/service/gpu:gpu_transfer_manager",
-            "@xla//xla/service/gpu:nvptx_compiler",
-            "@xla//xla/service/gpu/model:hlo_op_profile_proto_cc_impl",
-            "@xla//xla/service/gpu/model:hlo_op_profiles",
-            "@xla//xla/stream_executor:cuda_platform",
-            "@xla//xla/stream_executor:kernel",
-            "@xla//xla/stream_executor/cuda:all_runtime",
-        ],
-        "//conditions:default": [
-        ],
-    }) + if_rocm([
+    ] + if_cuda([
+        "@jax//jaxlib/cuda:cuda_gpu_kernels",
+        "@xla//xla/backends/profiler:profiler_backends",
+        "@xla//xla/backends/profiler/gpu:device_tracer",
+        "@xla//xla/pjrt/c:pjrt_c_api_gpu_internal",
+        "@xla//xla/service/gpu:gpu_transfer_manager",
+        "@xla//xla/service/gpu:nvptx_compiler",
+        "@xla//xla/service/gpu/model:hlo_op_profile_proto_cc_impl",
+        "@xla//xla/service/gpu/model:hlo_op_profiles",
+        "@xla//xla/stream_executor:cuda_platform",
+        "@xla//xla/stream_executor:kernel",
+        "@xla//xla/stream_executor/cuda:all_runtime",
+    ]) + if_rocm([
         "@xla//xla/stream_executor:rocm_platform",
         "@xla//xla/service/gpu:amdgpu_compiler",
         "@xla//xla/backends/profiler/gpu:device_tracer",
 
@@ -11,7 +11,7 @@ http_archive(
     urls = ["https://github.com/wsmoses/nsync/archive/{commit}.tar.gz".format(commit = NSYNC_COMMIT)],
 )
 
-ENZYMEXLA_COMMIT = "6774f1afb90c377bbf234a7a7dbfab4f7b726481"
+ENZYMEXLA_COMMIT = "0dc3ef87806ab3c9a695fe5c5689f8e2baf0d6cb"
 
 ENZYMEXLA_SHA256 = ""
 
@@ -68,7 +68,10 @@ CUPTI_NEW = []
 
 XLA_PATCHES = XLA_PATCHES + CUPTI_NEW + [
     """
-sed -i.bak0 "s/kSupportedOpcodes({/kSupportedOpcodes(absl::flat_hash_set<HloOpcode>{/g" xla/service/gpu/gpu_memory_space_assignment.h
+sed -i.bak0 "s/kDeprecatedFlags({/kDeprecatedFlags(absl::flat_hash_set<std::string>{/g" xla/debug_options_flags.cc
+""",
+    """
+sed -i.bak0 "s/kStableFlags({/kStableFlags(absl::flat_hash_set<std::string>{/g" xla/debug_options_flags.cc
 """,
     """
 sed -i.bak0 "s/cupti_driver_cbid/cupti/g" xla/backends/profiler/gpu/cupti_tracer.cc
@@ -102,17 +105,10 @@ sed -i.bak0 "s/patch_cmds = \\[/patch_cmds = \\[\\\"find . -type f -name config.
     # """,
 ]
 
-LLVM_TARGETS = select({
-    "@bazel_tools//src/conditions:windows": [
-        "AMDGPU",
-        "NVPTX",
-    ],
-    "@bazel_tools//src/conditions:darwin": [],
-    "//conditions:default": [
-        "AMDGPU",
-        "NVPTX",
-    ],
-}) + [
+LLVM_TARGETS = [
+    "AMDGPU",
+    "NVPTX",
+] + [
     "AArch64",
     "X86",
     "ARM",
@@ -237,6 +233,17 @@ load("@jax//third_party/xla:workspace.bzl", jax_xla_workspace = "repo")
 
 jax_xla_workspace()
 
+load("@xla//third_party/llvm:workspace.bzl", llvm = "repo")
+
+llvm("llvm-raw")
+
+load("@llvm-raw//utils/bazel:configure.bzl", "llvm_configure")
+
+llvm_configure(
+    name = "llvm-project",
+    targets = LLVM_TARGETS,
+)
+
 load("@xla//:workspace4.bzl", "xla_workspace4")
 
 xla_workspace4()
@@ -245,14 +252,8 @@ load("@xla//:workspace3.bzl", "xla_workspace3")
 
 xla_workspace3()
 
-load("@llvm-raw//utils/bazel:configure.bzl", "llvm_configure")
 load("@xla//:workspace2.bzl", "xla_workspace2")
 
-llvm_configure(
-    name = "llvm-project",
-    targets = LLVM_TARGETS,
-)
-
 xla_workspace2()
 
 load("@xla//:workspace1.bzl", "xla_workspace1")
@@ -285,7 +286,18 @@ load("@jax//third_party/flatbuffers:workspace.bzl", flatbuffers = "repo")
 flatbuffers()
 
 load(
-    "@xla//third_party/gpus/cuda/hermetic:cuda_json_init_repository.bzl",
+    "@rules_ml_toolchain//cc_toolchain/deps:cc_toolchain_deps.bzl",
+    "cc_toolchain_deps",
+)
+
+cc_toolchain_deps()
+
+register_toolchains("@rules_ml_toolchain//cc_toolchain:lx64_lx64")
+
+register_toolchains("@rules_ml_toolchain//cc_toolchain:lx64_lx64_cuda")
+
+load(
+    "@rules_ml_toolchain//third_party/gpus/cuda/hermetic:cuda_json_init_repository.bzl",
     "cuda_json_init_repository",
 )
 
@@ -297,7 +309,12 @@ load(
     "CUDNN_REDISTRIBUTIONS",
 )
 load(
-    "@xla//third_party/gpus/cuda/hermetic:cuda_redist_init_repositories.bzl",
+    "@cuda_redist_json//:distributions.bzl",
+    "CUDA_REDISTRIBUTIONS",
+    "CUDNN_REDISTRIBUTIONS",
+)
+load(
+    "@rules_ml_toolchain//third_party/gpus/cuda/hermetic:cuda_redist_init_repositories.bzl",
     "cuda_redist_init_repositories",
     "cudnn_redist_init_repository",
 )
@@ -311,28 +328,28 @@ cudnn_redist_init_repository(
 )
 
 load(
-    "@xla//third_party/gpus/cuda/hermetic:cuda_configure.bzl",
+    "@rules_ml_toolchain//third_party/gpus/cuda/hermetic:cuda_configure.bzl",
     "cuda_configure",
 )
 
 cuda_configure(name = "local_config_cuda")
 
 load(
-    "@xla//third_party/nccl/hermetic:nccl_redist_init_repository.bzl",
+    "@rules_ml_toolchain//third_party/nccl/hermetic:nccl_redist_init_repository.bzl",
     "nccl_redist_init_repository",
 )
 
 nccl_redist_init_repository()
 
 load(
-    "@xla//third_party/nccl/hermetic:nccl_configure.bzl",
+    "@rules_ml_toolchain//third_party/nccl/hermetic:nccl_configure.bzl",
     "nccl_configure",
 )
 
 nccl_configure(name = "local_config_nccl")
 
 load(
-    "@xla//third_party/nvshmem/hermetic:nvshmem_json_init_repository.bzl",
+    "@rules_ml_toolchain//third_party/nvshmem/hermetic:nvshmem_json_init_repository.bzl",
     "nvshmem_json_init_repository",
 )
 
@@ -343,17 +360,10 @@ load(
     "NVSHMEM_REDISTRIBUTIONS",
 )
 load(
-    "@xla//third_party/nvshmem/hermetic:nvshmem_redist_init_repository.bzl",
+    "@rules_ml_toolchain//third_party/nvshmem/hermetic:nvshmem_redist_init_repository.bzl",
     "nvshmem_redist_init_repository",
 )
 
 nvshmem_redist_init_repository(
     nvshmem_redistributions = NVSHMEM_REDISTRIBUTIONS,
 )
-
-load(
-    "@xla//third_party/nvshmem/hermetic:nvshmem_configure.bzl",
-    "nvshmem_configure",
-)
-
-nvshmem_configure(name = "local_config_nvshmem")