diff --git a/.github/workflows/helpers/build_target.sh b/.github/workflows/helpers/build_target.sh
deleted file mode 100755
index cc4e25cc0b..0000000000
--- a/.github/workflows/helpers/build_target.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#! /usr/bin/env bash
-
-set -euo pipefail
-
-DIR="$(realpath -- "$(dirname "${BASH_SOURCE[0]}")")"
-REPO="$(realpath -- "$DIR/../../../")"
-
-cd "$REPO/build-ci"
-make -j $(( $(nproc) < 2 ? 1 : $(nproc)-1 )) "$@"
diff --git a/.github/workflows/helpers/cmake_cuda.sh b/.github/workflows/helpers/cmake_cuda.sh
deleted file mode 100755
index f062569efb..0000000000
--- a/.github/workflows/helpers/cmake_cuda.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#! /usr/bin/env bash
-
-set -euo pipefail
-set -x
-
-DIR="$(realpath -- "$(dirname "${BASH_SOURCE[0]}")")"
-REPO="$(realpath -- "$DIR/../../../")"
-
-export FF_GPU_BACKEND="cuda"
-export FF_CUDA_ARCH=70
-
-if [[ -d "$REPO/build-ci" ]]; then
-  rm -rf "$REPO/build-ci"
-fi
-mkdir "$REPO/build-ci"
-cd "$REPO/build-ci"
-#if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then
-#  export FF_BUILD_ALL_EXAMPLES=ON
-#  export FF_BUILD_UNIT_TESTS=ON
-#fi
-IFS=" " read -r -a FLAGS <<< "$CMAKE_FLAGS"
-../config/config.linux \
-  -DCMAKE_C_COMPILER_LAUNCHER=ccache \
-  -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-  -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \
-  -DFF_USE_CODE_COVERAGE=ON \
-  "${FLAGS[@]}"
-
-# vim: set tabstop=2 shiftwidth=2 expandtab:
diff --git a/.github/workflows/helpers/free_space_on_runner_gpu.sh b/.github/workflows/helpers/free_space_on_runner_gpu.sh
new file mode 100755
index 0000000000..a382ee58f6
--- /dev/null
+++ b/.github/workflows/helpers/free_space_on_runner_gpu.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+set -euo pipefail
+set -x
+
+sudo rm -rf /usr/share/dotnet
+sudo rm -rf /usr/local/lib/android
+sudo rm -rf /opt/ghc
+sudo rm -rf "/usr/local/share/boost"
diff --git a/.github/workflows/helpers/gpu_ci_helper.py b/.github/workflows/helpers/gpu_ci_helper.py
deleted file mode 100644
index c29994795f..0000000000
--- a/.github/workflows/helpers/gpu_ci_helper.py
+++ /dev/null
@@ -1,85 +0,0 @@
-#!/usr/bin/env python3
-
-from github import Github
-import os, sys, argparse, time
-
-
-def get_num_workflow_runs(repo, workflow_names, in_progress_only=False):
-    workflows = [
-        w for w in repo.get_workflows() for w_name in workflow_names if w.path == w_name
-    ]
-    if len(workflows) != len(workflow_names):
-        print(
-            f"Found {len(workflows)} workflows instead of {len(workflow_names)}. Weird."
-        )
-        sys.exit(1)
-    count = 0
-    for workflow in workflows:
-        running_states = (
-            ["in_progress"] if in_progress_only else ["queued", "in_progress"]
-        )
-        runs = [
-            run for status in running_states for run in workflow.get_runs(status=status)
-        ]
-        count += len(runs)
-    return count
-
-
-if __name__ == "__main__":
-
-    # Check who is running this script (the daemon or a regular gpu-ci runner)
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--daemon", action="store_true")
-    args = parser.parse_args()
-
-    # Log into the GitHub API and get a handle to the repo
-    git_token = os.getenv("FLEXFLOW_TOKEN") or ""
-    if len(git_token) < 40:
-        print("FLEXFLOW_TOKEN not set properly")
-        sys.exit(1)
-    git_client = Github(git_token)
-    if not git_client:
-        print("Could not get a Git client")
-        sys.exit(1)
-    repo = git_client.get_repo("flexflow/FlexFlow")
-    if not repo:
-        print("Could not access the FlexFlow repo")
-        sys.exit(1)
-
-    if args.daemon:
-        print("Running the daemon...")
-        # Check if there is any `gpu-ci` workflow in progress or queued
-        target_workflows = [
-            ".github/workflows/gpu-ci.yml",
-            ".github/workflows/multinode-test.yml",
-        ]
-        n = get_num_workflow_runs(repo, target_workflows, in_progress_only=False)
-        print(f"Detected {n} GPU-related workflow runs in progress or queued")
-
-        instance_id = os.getenv("FLEXFLOW_RUNNER_INSTANCE_ID") or ""
-        if len(instance_id) != 19:
-            print("FLEXFLOW_RUNNER_INSTANCE_ID not set properly")
-            sys.exit(1)
-        # If there are `gpu-ci` runs in progress or queued, turn on the `flexflow-runner` spot instance,
-        # if it is not already on. If there are no `gpu-ci` runs in progress or queued, turn off
-        # the spot instance if it is not already off.
-        if n > 0:
-            print("Starting the `flexflow-runner` spot instance (if not already on)...")
-            os.system(
-                f"aws ec2 start-instances --region us-east-2 --instance-ids {instance_id}"
-            )
-        else:
-            print(
-                "Stopping the `flexflow-runner` spot instance (if not already off)..."
-            )
-            os.system(
-                f"aws ec2 stop-instances --region us-east-2 --instance-ids {instance_id}"
-            )
-    else:
-        print("Waiting for the deamon to finish running...")
-        # Wait until the daemon has finished running
-        target_workflow = [".github/workflows/gpu-ci-daemon.yml"]
-        n = get_num_workflow_runs(repo, target_workflow, in_progress_only=True)
-        while n > 0:
-            time.sleep(30)
-            n = get_num_workflow_runs(repo, target_workflow, in_progress_only=True)
diff --git a/.github/workflows/helpers/install_cudnn.sh b/.github/workflows/helpers/install_cudnn.sh
deleted file mode 100755
index d77745451b..0000000000
--- a/.github/workflows/helpers/install_cudnn.sh
+++ /dev/null
@@ -1,56 +0,0 @@
-#!/bin/bash
-set -euo pipefail
-set -x
-
-# Cd into directory holding this script
-cd "${BASH_SOURCE[0]%/*}"
-
-# Install CUDNN
-cuda_version=${1:-11.1.1}
-cuda_version=$(echo "${cuda_version}" | cut -f1,2 -d'.')
-echo "Installing CUDNN for CUDA version: ${cuda_version} ..."
-CUDNN_LINK=http://developer.download.nvidia.com/compute/redist/cudnn/v8.0.5/cudnn-11.1-linux-x64-v8.0.5.39.tgz
-CUDNN_TARBALL_NAME=cudnn-11.1-linux-x64-v8.0.5.39.tgz
-if [[ "$cuda_version" == "10.1" ]]; then
-  CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.0.5/cudnn-10.1-linux-x64-v8.0.5.39.tgz
-  CUDNN_TARBALL_NAME=cudnn-10.1-linux-x64-v8.0.5.39.tgz
-elif [[ "$cuda_version" == "10.2" ]]; then
-  CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.0.5/cudnn-10.2-linux-x64-v8.0.5.39.tgz
-  CUDNN_TARBALL_NAME=cudnn-10.2-linux-x64-v8.0.5.39.tgz
-elif [[ "$cuda_version" == "11.0" ]]; then
-  CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.0.5/cudnn-11.0-linux-x64-v8.0.5.39.tgz
-  CUDNN_TARBALL_NAME=cudnn-11.0-linux-x64-v8.0.5.39.tgz
-elif [[ "$cuda_version" == "11.1" ]]; then
-  CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.0.5/cudnn-11.1-linux-x64-v8.0.5.39.tgz
-  CUDNN_TARBALL_NAME=cudnn-11.1-linux-x64-v8.0.5.39.tgz
-elif [[ "$cuda_version" == "11.2" ]]; then
-  CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.1.1/cudnn-11.2-linux-x64-v8.1.1.33.tgz
-  CUDNN_TARBALL_NAME=cudnn-11.2-linux-x64-v8.1.1.33.tgz
-elif [[ "$cuda_version" == "11.3" ]]; then
-  CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.2.1/cudnn-11.3-linux-x64-v8.2.1.32.tgz
-  CUDNN_TARBALL_NAME=cudnn-11.3-linux-x64-v8.2.1.32.tgz
-elif [[ "$cuda_version" == "11.4" ]]; then
-  CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.2.4/cudnn-11.4-linux-x64-v8.2.4.15.tgz
-  CUDNN_TARBALL_NAME=cudnn-11.4-linux-x64-v8.2.4.15.tgz
-elif [[ "$cuda_version" == "11.5" ]]; then
-  CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.0/cudnn-11.5-linux-x64-v8.3.0.98.tgz
-  CUDNN_TARBALL_NAME=cudnn-11.5-linux-x64-v8.3.0.98.tgz
-elif [[ "$cuda_version" == "11.6" ]]; then
-  CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.4.0/local_installers/11.6/cudnn-linux-x86_64-8.4.0.27_cuda11.6-archive.tar.xz
-  CUDNN_TARBALL_NAME=cudnn-linux-x86_64-8.4.0.27_cuda11.6-archive.tar.xz
-elif [[ "$cuda_version" == "11.7" ]]; then
-  CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.5.0/local_installers/11.7/cudnn-linux-x86_64-8.5.0.96_cuda11-archive.tar.xz
-  CUDNN_TARBALL_NAME=cudnn-linux-x86_64-8.5.0.96_cuda11-archive.tar.xz
-fi
-wget -c -q $CUDNN_LINK
-if [[ "$cuda_version" == "11.6" || "$cuda_version" == "11.7" ]]; then
-  tar -xf $CUDNN_TARBALL_NAME -C ./
-  CUDNN_EXTRACTED_TARBALL_NAME="${CUDNN_TARBALL_NAME::-7}"
-  sudo cp -r "$CUDNN_EXTRACTED_TARBALL_NAME/include/*" "/usr/local/include"
-  sudo cp -r "$CUDNN_EXTRACTED_TARBALL_NAME/lib/*" "/usr/local/lib"
-  rm -rf "$CUDNN_EXTRACTED_TARBALL_NAME"
-else
-  sudo tar -xzf $CUDNN_TARBALL_NAME -C /usr/local
-fi
-rm $CUDNN_TARBALL_NAME
-sudo ldconfig
diff --git a/.github/workflows/helpers/install_dependencies.sh b/.github/workflows/helpers/install_dependencies.sh
deleted file mode 100755
index 5ab211c962..0000000000
--- a/.github/workflows/helpers/install_dependencies.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/bin/bash
-set -euo pipefail
-set -x
-
-# Cd into directory holding this script
-cd "${BASH_SOURCE[0]%/*}"
-
-# General dependencies
-echo "Installing apt dependencies..."
-sudo apt-get update && sudo apt-get install -y --no-install-recommends wget binutils git zlib1g-dev libhdf5-dev && \
-  sudo rm -rf /var/lib/apt/lists/*
-
-# Install CUDNN
-./install_cudnn.sh
-
-# Install HIP dependencies if needed
-FF_GPU_BACKEND=${FF_GPU_BACKEND:-"cuda"}
-if [[ "${FF_GPU_BACKEND}" != @(cuda|hip_cuda|hip_rocm|intel) ]]; then
-  echo "Error, value of FF_GPU_BACKEND (${FF_GPU_BACKEND}) is invalid."
-  exit 1
-elif [[ "$FF_GPU_BACKEND" == "hip_cuda" || "$FF_GPU_BACKEND" = "hip_rocm" ]]; then
-  echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Installing HIP dependencies"
-  wget https://repo.radeon.com/amdgpu-install/22.20.5/ubuntu/focal/amdgpu-install_22.20.50205-1_all.deb
-  sudo apt-get install -y ./amdgpu-install_22.20.50205-1_all.deb
-  rm ./amdgpu-install_22.20.50205-1_all.deb
-  sudo amdgpu-install -y --usecase=hip,rocm --no-dkms
-  sudo apt-get install -y hip-dev hipblas miopen-hip rocm-hip-sdk
-else
-  echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Skipping installing HIP dependencies"
-fi
-sudo rm -rf /var/lib/apt/lists/*
diff --git a/.github/workflows/helpers/test_target.sh b/.github/workflows/helpers/test_target.sh
deleted file mode 100755
index 69baa66364..0000000000
--- a/.github/workflows/helpers/test_target.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#! /usr/bin/env bash
-
-set -euo pipefail
-set -x
-
-DIR="$(realpath -- "$(dirname "${BASH_SOURCE[0]}")")"
-REPO="$(realpath -- "$DIR/../../../")"
-
-TEST_LIBS=("${@/%/-tests}")
-REGEX="^($(IFS='|'; echo "${TEST_LIBS[*]}"))\$"
-
-cd "$REPO/build-ci"
-make -j $(( $(nproc) < 2 ? 1 : $(nproc)-1 )) "${TEST_LIBS[@]}"
-ctest --progress --output-on-failure -L "$REGEX"
diff --git a/.github/workflows/per-lib-check.yml b/.github/workflows/per-lib-check.yml
deleted file mode 100644
index b54ef25819..0000000000
--- a/.github/workflows/per-lib-check.yml
+++ /dev/null
@@ -1,158 +0,0 @@
-name: "per-lib-checks"
-on: [push, pull_request, workflow_dispatch]
-concurrency:
-  group: build-${{ github.head_ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  cmake-build:
-    name: Library CMake Build
-    runs-on: ubuntu-20.04
-
-    strategy:
-      max-parallel: 1
-      matrix:
-        gpu_backend: ["cuda"]
-      fail-fast: false
-    steps:
-      - name: Checkout Git Repository
-        uses: actions/checkout@v3
-        with:
-          submodules: recursive
-
-      - name: Add helpers directory to path
-        run: echo "${PWD}/.github/workflows/helpers" >> $GITHUB_PATH
-
-      - name: Free additional space on runner
-        run: free_space_on_runner.sh
-
-      - name: Install nix
-        uses: cachix/install-nix-action@v25
-        with:
-          github_access_token: '${{ secrets.GITHUB_TOKEN }}'
-
-      - uses: cachix/cachix-action@v14
-        with:
-          name: ff
-          skipPush: true
-          # authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
-
-      - name: setup nix develop shell
-        uses: nicknovitski/nix-develop@v1.1.0
-        with:
-          arguments: "--accept-flake-config .#ci"
-
-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2
-
-      # - name: Install system dependencies
-      #   run: FF_GPU_BACKEND=${{ matrix.gpu_backend }} .github/workflows/helpers/install_dependencies.sh
-
-      # - name: Install conda and FlexFlow dependencies
-      #   uses: conda-incubator/setup-miniconda@v2
-      #   with:
-      #     activate-environment: flexflow
-      #     environment-file: packaging/conda/environment.yml
-      #     auto-activate-base: false
-
-      - name: Regenerate all dtgen files
-        run: |
-          proj dtgen --force
-
-      - name: Run cmake
-        run: |
-          cmake_${{ matrix.gpu_backend }}.sh
-
-      - name: Build utils
-        run: |
-          build_target.sh utils
-
-      - name: Build op-attrs
-        run: |
-          build_target.sh op-attrs
-
-      - name: Build pcg
-        run: |
-          build_target.sh pcg
-
-      - name: Build kernels
-        run: |
-          build_target.sh kernels
-
-      - name: Build substitutions
-        run: |
-          build_target.sh substitutions
-
-      - name: Build compiler
-        run: |
-          build_target.sh compiler
-
-      - name: Build substitution-generator
-        run: |
-          build_target.sh substitution-generator
-
-      - name: Build local-execution
-        run: |
-          build_target.sh local-execution
-
-      - name: Build models
-        run: |
-          build_target.sh models
-
-      - name: Build substitution-to-dot
-        run: |
-          build_target.sh substitution-to-dot
-
-      - name: Build export-model-arch
-        run: |
-          build_target.sh export-model-arch
-
-      - name: Test utils
-        run: |
-          test_target.sh utils
-
-      - name: Test op-attrs
-        run: |
-          test_target.sh op-attrs
-
-      - name: Test pcg
-        run: |
-          test_target.sh pcg
-
-      - name: Test substitutions
-        run: |
-          test_target.sh substitutions
-
-      - name: Test compiler
-        run: |
-          test_target.sh compiler
-
-      - name: Test substitution-generator
-        run: |
-          test_target.sh substitution-generator
-
-      - name: Test local-execution
-        run: |
-          test_target.sh local-execution
-
-      - name: Test models
-        run: |
-          test_target.sh models
-
-      - name: Generate code coverage
-        run: |
-          echo "gitwork: $GITHUB_WORKSPACE"
-          lcov --capture --directory . --output-file main_coverage.info
-          lcov --extract main_coverage.info "$GITHUB_WORKSPACE/lib/*" --output-file main_coverage.info
-          lcov --remove main_coverage.info "$GITHUB_WORKSPACE/lib/*.dtg.h" "$GITHUB_WORKSPACE/lib/*.dtg.cc" --output-file main_coverage.info
-          lcov --list main_coverage.info
-
-      - name: Upload code coverage
-        uses: codecov/codecov-action@v4
-        with:
-          token: ${{ secrets.CODECOV_TOKEN }}
-          file: main_coverage.info
-          flags: unittests
-          name: codecov-umbrella
-          fail_ci_if_error: false
-          verbose: true
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
new file mode 100644
index 0000000000..7e2dabd784
--- /dev/null
+++ b/.github/workflows/tests.yml
@@ -0,0 +1,116 @@
+name: "tests"
+on: [push, pull_request, workflow_dispatch]
+concurrency:
+  group: build-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  cpu-ci:
+    name: CPU unit tests and build
+    runs-on: ubuntu-20.04
+
+    steps:
+      - name: Checkout Git Repository
+        uses: actions/checkout@v3
+        with:
+          submodules: recursive
+
+      - name: Free additional space on runner
+        run: ./.github/workflows/helpers/free_space_on_runner_gpu.sh
+
+      - name: Install nix
+        uses: cachix/install-nix-action@v25
+        with:
+          github_access_token: '${{ secrets.GITHUB_TOKEN }}'
+
+      - uses: cachix/cachix-action@v14
+        with:
+          name: ff
+          skipPush: true
+          # authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
+
+      - name: setup nix develop shell
+        uses: nicknovitski/nix-develop@v1.1.0
+        env:
+          NIXPKGS_ALLOW_UNFREE: 1
+        with:
+          arguments: ".#ci --accept-flake-config"
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2
+
+      - name: Regenerate all dtgen files
+        run: |
+          proj dtgen --force
+
+      - name: Run cmake
+        run: |
+          proj cmake --dtgen-skip
+
+      - name: Run build and tests
+        run: |
+          proj test --dtgen-skip -j$(nproc) --coverage --skip-gpu-tests
+
+      - name: Upload code coverage
+        uses: codecov/codecov-action@v4
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
+          file: main_coverage.info
+          flags: unittests
+          name: codecov-umbrella
+          fail_ci_if_error: false
+          verbose: true
+
+  gpu-ci:
+    name: GPU unit tests
+    needs: cpu-ci
+    runs-on:
+      - runs-on
+      - family=g4dn.xlarge
+      - image=ubuntu22-full-x64
+
+    strategy:
+      max-parallel: 1
+      fail-fast: false
+
+    steps:
+      - name: checkout git repository
+        uses: actions/checkout@v3
+        with:
+          submodules: recursive
+
+      - name: free additional space on runner
+        run: ./.github/workflows/helpers/free_space_on_runner_gpu.sh
+
+      - name: install nix
+        uses: cachix/install-nix-action@v25
+        with:
+          github_access_token: '${{ secrets.GITHUB_TOKEN }}'
+
+      - uses: cachix/cachix-action@v14
+        with:
+          name: ff
+          skipPush: true
+          # authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
+
+      - name: setup nix develop shell
+        uses: nicknovitski/nix-develop@v1.1.0
+        env:
+          NIXPKGS_ALLOW_UNFREE: 1
+        with:
+          arguments: ".#gpu-ci --accept-flake-config --impure"
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2
+
+      - name: regenerate all dtgen files
+        run: |
+          proj dtgen --force
+
+      - name: run cmake
+        run: |
+          proj cmake --dtgen-skip
+
+      - name: build and run gpu tests
+        run: |
+          proj test --dtgen-skip -j$(nproc) --skip-build-cpu-tests
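
The CPU job above just enters the repository's ci dev shell and drives everything through proj, so it can be reproduced locally with the same commands (a rough sketch, assuming nix with flakes enabled; the nix develop invocation below approximates what the nicknovitski/nix-develop action does):

    # enter the ci dev shell defined in flake.nix
    nix develop .#ci --accept-flake-config
    # regenerate dtgen files, configure, then build and run the CPU-only test suite
    proj dtgen --force
    proj cmake --dtgen-skip
    proj test --dtgen-skip -j$(nproc) --coverage --skip-gpu-tests
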
diff --git a/.proj.toml b/.proj.toml
index 5592f184ad..10307a6efa 100644
--- a/.proj.toml
+++ b/.proj.toml
@@ -18,9 +18,9 @@ build_targets = [
 ]
 
 test_targets = [
-  # "kernels-tests",
   "utils-tests",
   "op-attrs-tests",
+  "kernels-tests",
   "pcg-tests",
   "substitutions-tests",
   "compiler-tests",
diff --git a/flake.lock b/flake.lock
index 1fb4f26189..1ebfe0b0b4 100644
--- a/flake.lock
+++ b/flake.lock
@@ -18,6 +18,29 @@
         "type": "github"
       }
     },
+    "nixGL": {
+      "inputs": {
+        "flake-utils": [
+          "flake-utils"
+        ],
+        "nixpkgs": [
+          "nixpkgs"
+        ]
+      },
+      "locked": {
+        "lastModified": 1713543440,
+        "narHash": "sha256-lnzZQYG0+EXl/6NkGpyIz+FEOc/DSEG57AP1VsdeNrM=",
+        "owner": "nix-community",
+        "repo": "nixGL",
+        "rev": "310f8e49a149e4c9ea52f1adf70cdc768ec53f8a",
+        "type": "github"
+      },
+      "original": {
+        "owner": "nix-community",
+        "repo": "nixGL",
+        "type": "github"
+      }
+    },
     "nixpkgs": {
       "locked": {
         "lastModified": 1710162809,
@@ -43,11 +66,11 @@
         ]
       },
       "locked": {
-        "lastModified": 1731206929,
-        "narHash": "sha256-5O85Ydkk4AG8F3Y5pFj3aywCZwGqmvOj1DFnIXgfyxs=",
+        "lastModified": 1737085085,
+        "narHash": "sha256-5b6ytCXd7RTQAt0/4uFbZPre98SkCEKNeedaLsbdYE4=",
         "owner": "lockshaw",
         "repo": "proj",
-        "rev": "99d4df1a81b3b7a6595e9e7913b20f9e6a7f5e21",
+        "rev": "d6a664dfc4a378d6b9cfaf9937cd9514f164c558",
         "type": "github"
       },
       "original": {
@@ -59,6 +82,7 @@
     "root": {
       "inputs": {
         "flake-utils": "flake-utils",
+        "nixGL": "nixGL",
         "nixpkgs": "nixpkgs",
         "proj-repo": "proj-repo"
       }
diff --git a/flake.nix b/flake.nix
index 38e59a81be..91651bd0c1 100644
--- a/flake.nix
+++ b/flake.nix
@@ -22,9 +22,15 @@
       inputs.nixpkgs.follows = "nixpkgs";
       inputs.flake-utils.follows = "flake-utils";
     };
+
+    nixGL = {
+      url = "github:nix-community/nixGL";
+      inputs.nixpkgs.follows = "nixpkgs";
+      inputs.flake-utils.follows = "flake-utils";
+    };
   };
 
-  outputs = { self, nixpkgs, flake-utils, proj-repo, ... }: flake-utils.lib.eachSystem [ "x86_64-linux" ] (system:
+  outputs = { self, nixpkgs, flake-utils, proj-repo, nixGL, ... }: flake-utils.lib.eachSystem [ "x86_64-linux" ] (system:
     let
       pkgs = import nixpkgs {
         inherit system;
@@ -65,24 +71,20 @@
         ci = mkShell {
           shellHook = ''
             export PATH="$HOME/ff/.scripts/:$PATH"
+            export RC_PARAMS="max_discard_ratio=100"
+            export CMAKE_FLAGS="-DFF_USE_EXTERNAL_LEGION=ON \
+              -DFF_USE_EXTERNAL_NCCL=ON \
+              -DFF_USE_EXTERNAL_JSON=ON \
+              -DFF_USE_EXTERNAL_FMT=ON \
+              -DFF_USE_EXTERNAL_SPDLOG=ON \
+              -DFF_USE_EXTERNAL_DOCTEST=ON \
+              -DFF_USE_EXTERNAL_RAPIDCHECK=ON \
+              -DFF_USE_EXTERNAL_EXPECTED=ON \
+              -DFF_USE_EXTERNAL_RANGEV3=ON \
+              -DFF_USE_EXTERNAL_BOOST_PREPROCESSOR=ON \
+              -DFF_USE_EXTERNAL_TYPE_INDEX=ON"
           '';
 
-          CMAKE_FLAGS = lib.strings.concatStringsSep " " [
-            "-DFF_USE_EXTERNAL_LEGION=ON"
-            "-DFF_USE_EXTERNAL_NCCL=ON"
-            "-DFF_USE_EXTERNAL_JSON=ON"
-            "-DFF_USE_EXTERNAL_FMT=ON"
-            "-DFF_USE_EXTERNAL_SPDLOG=ON"
-            "-DFF_USE_EXTERNAL_DOCTEST=ON"
-            "-DFF_USE_EXTERNAL_RAPIDCHECK=ON"
-            "-DFF_USE_EXTERNAL_EXPECTED=ON"
-            "-DFF_USE_EXTERNAL_RANGEV3=ON"
-            "-DFF_USE_EXTERNAL_BOOST_PREPROCESSOR=ON"
-            "-DFF_USE_EXTERNAL_TYPE_INDEX=ON"
-          ];
-
-          RC_PARAMS = "max_discard_ratio=100";
-
           buildInputs = builtins.concatLists [
@@ -104,20 +106,30 @@
               tl-expected
               doxygen
               lcov # for code coverage
+              compdb
+            ])
+            (with proj-repo.packages.${system}; [
+              proj
             ])
-            [ proj ]
             (with self.packages.${system}; [
               legion
-              hpp2plantuml
               rapidcheckFull
               doctest
             ])
           ];
         };
 
+        gpu-ci = mkShell {
+          inputsFrom = [ ci ];
+          buildInputs = builtins.concatLists [
+            (with nixGL.packages.${system}; [
+              nixGLDefault
+            ])
+          ];
+        };
+
         default = mkShell {
           inputsFrom = [ ci ];
-          inherit (ci) CMAKE_FLAGS RC_PARAMS;
 
           VIMPLUGINS = lib.strings.concatStringsSep "," [
@@ -130,10 +142,8 @@
               shellcheck
               plantuml
               ruff
-              compdb
               jq
               gh
-              lcov # for code coverage
             ])
             (with pkgs.python3Packages; [
               gitpython
@@ -150,9 +160,14 @@
             ])
             (with self.packages.${system}; [
               ffdb
+              hpp2plantuml
             ])
           ];
         };
+
+        gpu = mkShell {
+          inputsFrom = [ gpu-ci default ];
+        };
       };
     }
   );
diff --git a/lib/kernels/test/CMakeLists.txt b/lib/kernels/test/CMakeLists.txt
index 007740b510..00da2d0d70 100644
--- a/lib/kernels/test/CMakeLists.txt
+++ b/lib/kernels/test/CMakeLists.txt
@@ -15,3 +15,10 @@ ff_add_test_executable(
     cudart
     cublas
 )
+
+set(FF_TEST_EXEC_NAME "kernels-tests")
+add_custom_command(
+  TARGET ${FF_TEST_EXEC_NAME} POST_BUILD
+  COMMAND ${CMAKE_COMMAND} -DFF_TEST_EXEC_NAME=${FF_TEST_EXEC_NAME} -P ${CMAKE_CURRENT_LIST_DIR}/modify_test_commands.cmake
+  DEPENDS ${FF_TEST_EXEC_NAME}
+)
diff --git a/lib/kernels/test/modify_test_commands.cmake b/lib/kernels/test/modify_test_commands.cmake
new file mode 100644
index 0000000000..6494ae2d78
--- /dev/null
+++ b/lib/kernels/test/modify_test_commands.cmake
@@ -0,0 +1,21 @@
+# modify_test_commands.cmake
+
+file(GLOB ctest_tests_files "${CMAKE_CURRENT_BINARY_DIR}/${FF_TEST_EXEC_NAME}_tests-*.cmake")
+
+foreach(ctest_tests_file IN LISTS ctest_tests_files)
+  file(READ "${ctest_tests_file}" content)
+
+  # add nix run prefix
+  string(REGEX REPLACE
+    "add_test\\([ \t\r\n]*\\[==\\[([^]]+)\\]==\\][ \t\r\n]+([^ ]+)[ \t\r\n]+\\[==\\[([^]]+)\\]==\\]\\)"
+    "add_test( [==[\\1]==] nixGL -- \\2 [==[\\3]==])"
+    content "${content}")
+
+  # add environment
+  # string(REGEX REPLACE
+  #   "set_tests_properties\\([ \t\r\n]*\\[==\\[([^]]+)\\]==\\][ \t\r\n]+PROPERTIES[ \t\r\n]+([^)]+)\\)"
+  #   "set_tests_properties( [==[\\1]==] PROPERTIES \\2 ENVIRONMENT \"NIXPKGS_ALLOW_UNFREE=1\")"
+  #   content "${content}")
+
+  file(WRITE "${ctest_tests_file}" "${content}")
+endforeach()
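
For context, the string(REGEX REPLACE ...) above rewrites the add_test() entries in the generated ctest script files for kernels-tests so that each test binary is launched through the nixGL wrapper, which supplies the host's GPU driver libraries inside the nix environment. Schematically (the test name and path below are illustrative, not taken from a real generated file):

    # before the post-build rewrite
    add_test( [==[SomeKernelTestCase]==] /path/to/kernels-tests [==[--test-case=SomeKernelTestCase]==])
    # after the post-build rewrite
    add_test( [==[SomeKernelTestCase]==] nixGL -- /path/to/kernels-tests [==[--test-case=SomeKernelTestCase]==])
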
diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc
index 55ce7da331..2212e384fa 100644
--- a/lib/kernels/test/src/test_concat_kernel.cc
+++ b/lib/kernels/test/src/test_concat_kernel.cc
@@ -21,7 +21,7 @@ TEST_SUITE(FF_TEST_SUITE) {
 
     SUBCASE("forward_kernel") {
       std::vector<GenericTensorAccessorR> input_accessors =
-          repeat(num_inputs, [&]() {
+          repeat<GenericTensorAccessorR>(num_inputs, [&]() {
             return read_only_accessor_from_write_accessor(
                 create_random_filled_accessor_w(input_shape, allocator));
           });
@@ -44,9 +44,10 @@ TEST_SUITE(FF_TEST_SUITE) {
       GenericTensorAccessorR output_grad_accessor =
           read_only_accessor_from_write_accessor(
              create_random_filled_accessor_w(output_shape, allocator));
-      std::vector<GenericTensorAccessorW> input_grad_accessors = repeat(
-          num_inputs, [&]() { return allocator.allocate_tensor(input_shape); });
-
+      std::vector<GenericTensorAccessorW> input_grad_accessors =
+          repeat<GenericTensorAccessorW>(num_inputs, [&]() {
+            return allocator.allocate_tensor(input_shape);
+          });
       Kernels::Concat::backward_kernel(managed_stream.raw_stream(),
                                        output_grad_accessor,
                                        input_grad_accessors,
diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc
index 81f3c7183a..e29143e251 100644
--- a/lib/kernels/test/src/test_dropout.cc
+++ b/lib/kernels/test/src/test_dropout.cc
@@ -25,7 +25,8 @@ TEST_SUITE(FF_TEST_SUITE) {
         managed_handle.raw_handle(), dropout_rate, seed, shape, allocator);
 
     auto get_zero_count = [](std::vector<float> const &data) {
-      return count(data, [](float x) { return x == 0.0f; });
+      return std::count_if(
+          data.begin(), data.end(), [](float x) { return x == 0.0f; });
     };
 
     SUBCASE("forward_kernel") {
diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc
index 7cc2b28c9e..f2346c9244 100644
--- a/lib/kernels/test/src/test_split_kernel.cc
+++ b/lib/kernels/test/src/test_split_kernel.cc
@@ -23,7 +23,8 @@ TEST_SUITE(FF_TEST_SUITE) {
     GenericTensorAccessorW input_accessor =
         create_random_filled_accessor_w(input_shape, allocator);
-    std::vector<float *> output_ptrs = repeat(num_outputs, [&]() {
+    std::vector<float *> output_ptrs(num_outputs);
+    generate_n(output_ptrs.begin(), num_outputs, [&]() {
       GenericTensorAccessorW output_accessor =
           allocator.allocate_tensor(output_shape);
       return output_accessor.get_float_ptr();
     });
diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h
index abce3fd444..21d4923881 100644
--- a/lib/kernels/test/src/test_utils.h
+++ b/lib/kernels/test/src/test_utils.h
@@ -5,7 +5,13 @@
 #include "kernels/local_cuda_allocator.h"
 #include "kernels/managed_ff_stream.h"
 #include "kernels/managed_per_device_ff_handle.h"
+#include <algorithm>
 #include <doctest/doctest.h>
+#include <sstream>
+#include <string>
+#include <vector>
+
+using namespace FlexFlow;
 
 GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape,
                                                        Allocator &allocator,
@@ -42,7 +48,33 @@ std::vector<float> load_data_to_host_from_device(GenericTensorAccessorR accessor) {
 
 template <typename T>
 bool contains_non_zero(std::vector<T> &data) {
-  return !all_of(data, [](T const &val) { return val == 0; });
+  return !all_of(
+      data.begin(), data.end(), [](T const &val) { return val == 0; });
+}
+
+template <typename T, typename Func>
+std::vector<T> repeat(std::size_t n, Func &&func) {
+  std::vector<T> result;
+  // result.reserve(n); // Sometimes we don't have default constructor for T
+  for (std::size_t i = 0; i < n; ++i) {
+    result.push_back(func());
+  }
+  return result;
 }
 
+// Specialize doctest's StringMaker for std::vector
+template <>
+struct doctest::StringMaker<std::vector<float>> {
+  static doctest::String convert(std::vector<float> const &vec) {
+    std::ostringstream oss;
+    for (size_t i = 0; i < vec.size(); ++i) {
+      oss << vec[i];
+      if (i != vec.size() - 1) {
+        oss << ", ";
+      }
+    }
+    return doctest::String(("[" + oss.str() + "]").c_str());
+  }
+};
+
 #endif
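
As a usage sketch of the two helpers added in the hunk above (the float element type and the explicit template argument follow the reconstruction of this hunk, so treat the exact signatures as assumptions rather than a definitive API):

    // build a vector by invoking the callable n times
    std::vector<float> actual = repeat<float>(3, []() { return 0.0f; });
    // if this assertion fails, doctest renders both vectors through
    // StringMaker<std::vector<float>>, e.g. "[0, 0, 0]" instead of the default "{?}"
    CHECK(actual == std::vector<float>{1.0f, 2.0f, 3.0f});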