diff --git a/.github/workflows/helpers/build_target.sh b/.github/workflows/helpers/build_target.sh
deleted file mode 100755
index cc4e25cc0b..0000000000
--- a/.github/workflows/helpers/build_target.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#! /usr/bin/env bash
-
-set -euo pipefail
-
-DIR="$(realpath -- "$(dirname "${BASH_SOURCE[0]}")")"
-REPO="$(realpath -- "$DIR/../../../")"
-
-cd "$REPO/build-ci"
-make -j $(( $(nproc) < 2 ? 1 : $(nproc)-1 )) "$@"
diff --git a/.github/workflows/helpers/cmake_cuda.sh b/.github/workflows/helpers/cmake_cuda.sh
deleted file mode 100755
index f062569efb..0000000000
--- a/.github/workflows/helpers/cmake_cuda.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#! /usr/bin/env bash
-
-set -euo pipefail
-set -x
-
-DIR="$(realpath -- "$(dirname "${BASH_SOURCE[0]}")")"
-REPO="$(realpath -- "$DIR/../../../")"
-
-export FF_GPU_BACKEND="cuda"
-export FF_CUDA_ARCH=70
-
-if [[ -d "$REPO/build-ci" ]]; then
-  rm -rf "$REPO/build-ci"
-fi
-mkdir "$REPO/build-ci"
-cd "$REPO/build-ci"
-#if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then
-#  export FF_BUILD_ALL_EXAMPLES=ON
-#  export FF_BUILD_UNIT_TESTS=ON
-#fi
-IFS=" " read -r -a FLAGS <<< "$CMAKE_FLAGS"
-../config/config.linux \
-  -DCMAKE_C_COMPILER_LAUNCHER=ccache \
-  -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-  -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \
-  -DFF_USE_CODE_COVERAGE=ON \
-  "${FLAGS[@]}"
-
-# vim: set tabstop=2 shiftwidth=2 expandtab:
diff --git a/.github/workflows/helpers/free_space_on_runner_gpu.sh b/.github/workflows/helpers/free_space_on_runner_gpu.sh
new file mode 100755
index 0000000000..a382ee58f6
--- /dev/null
+++ b/.github/workflows/helpers/free_space_on_runner_gpu.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+set -euo pipefail
+set -x
+
+sudo rm -rf /usr/share/dotnet
+sudo rm -rf /usr/local/lib/android
+sudo rm -rf /opt/ghc
+sudo rm -rf "/usr/local/share/boost"
diff --git a/.github/workflows/helpers/gpu_ci_helper.py b/.github/workflows/helpers/gpu_ci_helper.py
deleted file mode 100644
index c29994795f..0000000000
--- a/.github/workflows/helpers/gpu_ci_helper.py
+++ /dev/null
@@ -1,85 +0,0 @@
-#!/usr/bin/env python3
-
-from github import Github
-import os, sys, argparse, time
-
-
-def get_num_workflow_runs(repo, workflow_names, in_progress_only=False):
-    workflows = [
-        w for w in repo.get_workflows() for w_name in workflow_names if w.path == w_name
-    ]
-    if len(workflows) != len(workflow_names):
-        print(
-            f"Found {len(workflows)} workflows instead of {len(workflow_names)}. Weird."
-        )
-        sys.exit(1)
-    count = 0
-    for workflow in workflows:
-        running_states = (
-            ["in_progress"] if in_progress_only else ["queued", "in_progress"]
-        )
-        runs = [
-            run for status in running_states for run in workflow.get_runs(status=status)
-        ]
-        count += len(runs)
-    return count
-
-
-if __name__ == "__main__":
-
-    # Check who is running this script (the daemon or a regular gpu-ci runner)
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--daemon", action="store_true")
-    args = parser.parse_args()
-
-    # Log into the GitHub API and get a handle to the repo
-    git_token = os.getenv("FLEXFLOW_TOKEN") or ""
-    if len(git_token) < 40:
-        print("FLEXFLOW_TOKEN not set properly")
-        sys.exit(1)
-    git_client = Github(git_token)
-    if not git_client:
-        print("Could not get a Git client")
-        sys.exit(1)
-    repo = git_client.get_repo("flexflow/FlexFlow")
-    if not repo:
-        print("Could not access the FlexFlow repo")
-        sys.exit(1)
-
-    if args.daemon:
-        print("Running the daemon...")
-        # Check if there is any `gpu-ci` workflow in progress or queued
-        target_workflows = [
-            ".github/workflows/gpu-ci.yml",
-            ".github/workflows/multinode-test.yml",
-        ]
-        n = get_num_workflow_runs(repo, target_workflows, in_progress_only=False)
-        print(f"Detected {n} GPU-related workflow runs in progress or queued")
-
-        instance_id = os.getenv("FLEXFLOW_RUNNER_INSTANCE_ID") or ""
-        if len(instance_id) != 19:
-            print("FLEXFLOW_RUNNER_INSTANCE_ID not set properly")
-            sys.exit(1)
-        # If there are `gpu-ci` runs in progress or queued, turn on the `flexflow-runner` spot instance,
-        # if it is not already on. If there are no `gpu-ci` runs in progress or queued, turn off
-        # the spot instance if it is not already off.
-        if n > 0:
-            print("Starting the `flexflow-runner` spot instance (if not already on)...")
-            os.system(
-                f"aws ec2 start-instances --region us-east-2 --instance-ids {instance_id}"
-            )
-        else:
-            print(
-                "Stopping the `flexflow-runner` spot instance (if not already off)..."
-            )
-            os.system(
-                f"aws ec2 stop-instances --region us-east-2 --instance-ids {instance_id}"
-            )
-    else:
-        print("Waiting for the deamon to finish running...")
-        # Wait until the daemon has finished running
-        target_workflow = [".github/workflows/gpu-ci-daemon.yml"]
-        n = get_num_workflow_runs(repo, target_workflow, in_progress_only=True)
-        while n > 0:
-            time.sleep(30)
-            n = get_num_workflow_runs(repo, target_workflow, in_progress_only=True)
diff --git a/.github/workflows/helpers/install_cudnn.sh b/.github/workflows/helpers/install_cudnn.sh
deleted file mode 100755
index d77745451b..0000000000
--- a/.github/workflows/helpers/install_cudnn.sh
+++ /dev/null
@@ -1,56 +0,0 @@
-#!/bin/bash
-set -euo pipefail
-set -x
-
-# Cd into directory holding this script
-cd "${BASH_SOURCE[0]%/*}"
-
-# Install CUDNN
-cuda_version=${1:-11.1.1}
-cuda_version=$(echo "${cuda_version}" | cut -f1,2 -d'.')
-echo "Installing CUDNN for CUDA version: ${cuda_version} ..."
-CUDNN_LINK=http://developer.download.nvidia.com/compute/redist/cudnn/v8.0.5/cudnn-11.1-linux-x64-v8.0.5.39.tgz
-CUDNN_TARBALL_NAME=cudnn-11.1-linux-x64-v8.0.5.39.tgz
-if [[ "$cuda_version" == "10.1" ]]; then
-  CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.0.5/cudnn-10.1-linux-x64-v8.0.5.39.tgz
-  CUDNN_TARBALL_NAME=cudnn-10.1-linux-x64-v8.0.5.39.tgz
-elif [[ "$cuda_version" == "10.2" ]]; then
-  CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.0.5/cudnn-10.2-linux-x64-v8.0.5.39.tgz
-  CUDNN_TARBALL_NAME=cudnn-10.2-linux-x64-v8.0.5.39.tgz
-elif [[ "$cuda_version" == "11.0" ]]; then
-  CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.0.5/cudnn-11.0-linux-x64-v8.0.5.39.tgz
-  CUDNN_TARBALL_NAME=cudnn-11.0-linux-x64-v8.0.5.39.tgz
-elif [[ "$cuda_version" == "11.1" ]]; then
-  CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.0.5/cudnn-11.1-linux-x64-v8.0.5.39.tgz
-  CUDNN_TARBALL_NAME=cudnn-11.1-linux-x64-v8.0.5.39.tgz
-elif [[ "$cuda_version" == "11.2" ]]; then
-  CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.1.1/cudnn-11.2-linux-x64-v8.1.1.33.tgz
-  CUDNN_TARBALL_NAME=cudnn-11.2-linux-x64-v8.1.1.33.tgz
-elif [[ "$cuda_version" == "11.3" ]]; then
-  CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.2.1/cudnn-11.3-linux-x64-v8.2.1.32.tgz
-  CUDNN_TARBALL_NAME=cudnn-11.3-linux-x64-v8.2.1.32.tgz
-elif [[ "$cuda_version" == "11.4" ]]; then
-  CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.2.4/cudnn-11.4-linux-x64-v8.2.4.15.tgz
-  CUDNN_TARBALL_NAME=cudnn-11.4-linux-x64-v8.2.4.15.tgz
-elif [[ "$cuda_version" == "11.5" ]]; then
-  CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.0/cudnn-11.5-linux-x64-v8.3.0.98.tgz
-  CUDNN_TARBALL_NAME=cudnn-11.5-linux-x64-v8.3.0.98.tgz
-elif [[ "$cuda_version" == "11.6" ]]; then
-  CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.4.0/local_installers/11.6/cudnn-linux-x86_64-8.4.0.27_cuda11.6-archive.tar.xz
-  CUDNN_TARBALL_NAME=cudnn-linux-x86_64-8.4.0.27_cuda11.6-archive.tar.xz
-elif [[ "$cuda_version" == "11.7" ]]; then
-  CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.5.0/local_installers/11.7/cudnn-linux-x86_64-8.5.0.96_cuda11-archive.tar.xz
-  CUDNN_TARBALL_NAME=cudnn-linux-x86_64-8.5.0.96_cuda11-archive.tar.xz
-fi
-wget -c -q $CUDNN_LINK
-if [[ "$cuda_version" == "11.6" || "$cuda_version" == "11.7" ]]; then
-  tar -xf $CUDNN_TARBALL_NAME -C ./
-  CUDNN_EXTRACTED_TARBALL_NAME="${CUDNN_TARBALL_NAME::-7}"
-  sudo cp -r "$CUDNN_EXTRACTED_TARBALL_NAME/include/*" "/usr/local/include"
-  sudo cp -r "$CUDNN_EXTRACTED_TARBALL_NAME/lib/*" "/usr/local/lib"
-  rm -rf "$CUDNN_EXTRACTED_TARBALL_NAME"
-else
-  sudo tar -xzf $CUDNN_TARBALL_NAME -C /usr/local
-fi
-rm $CUDNN_TARBALL_NAME
-sudo ldconfig
diff --git a/.github/workflows/helpers/install_dependencies.sh b/.github/workflows/helpers/install_dependencies.sh
deleted file mode 100755
index 5ab211c962..0000000000
--- a/.github/workflows/helpers/install_dependencies.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/bin/bash
-set -euo pipefail
-set -x
-
-# Cd into directory holding this script
-cd "${BASH_SOURCE[0]%/*}"
-
-# General dependencies
-echo "Installing apt dependencies..."
-sudo apt-get update && sudo apt-get install -y --no-install-recommends wget binutils git zlib1g-dev libhdf5-dev && \
-  sudo rm -rf /var/lib/apt/lists/*
-
-# Install CUDNN
-./install_cudnn.sh
-
-# Install HIP dependencies if needed
-FF_GPU_BACKEND=${FF_GPU_BACKEND:-"cuda"}
-if [[ "${FF_GPU_BACKEND}" != @(cuda|hip_cuda|hip_rocm|intel) ]]; then
-  echo "Error, value of FF_GPU_BACKEND (${FF_GPU_BACKEND}) is invalid."
-  exit 1
-elif [[ "$FF_GPU_BACKEND" == "hip_cuda" || "$FF_GPU_BACKEND" = "hip_rocm" ]]; then
-  echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Installing HIP dependencies"
-  wget https://repo.radeon.com/amdgpu-install/22.20.5/ubuntu/focal/amdgpu-install_22.20.50205-1_all.deb
-  sudo apt-get install -y ./amdgpu-install_22.20.50205-1_all.deb
-  rm ./amdgpu-install_22.20.50205-1_all.deb
-  sudo amdgpu-install -y --usecase=hip,rocm --no-dkms
-  sudo apt-get install -y hip-dev hipblas miopen-hip rocm-hip-sdk
-else
-  echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Skipping installing HIP dependencies"
-fi
-sudo rm -rf /var/lib/apt/lists/*
diff --git a/.github/workflows/helpers/test_target.sh b/.github/workflows/helpers/test_target.sh
deleted file mode 100755
index 69baa66364..0000000000
--- a/.github/workflows/helpers/test_target.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#! /usr/bin/env bash
-
-set -euo pipefail
-set -x
-
-DIR="$(realpath -- "$(dirname "${BASH_SOURCE[0]}")")"
-REPO="$(realpath -- "$DIR/../../../")"
-
-TEST_LIBS=("${@/%/-tests}")
-REGEX="^($(IFS='|'; echo "${TEST_LIBS[*]}"))\$"
-
-cd "$REPO/build-ci"
-make -j $(( $(nproc) < 2 ? 1 : $(nproc)-1 )) "${TEST_LIBS[@]}"
-ctest --progress --output-on-failure -L "$REGEX"
diff --git a/.github/workflows/per-lib-check.yml b/.github/workflows/per-lib-check.yml
deleted file mode 100644
index b54ef25819..0000000000
--- a/.github/workflows/per-lib-check.yml
+++ /dev/null
@@ -1,158 +0,0 @@
-name: "per-lib-checks"
-on: [push, pull_request, workflow_dispatch]
-concurrency:
-  group: build-${{ github.head_ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  cmake-build:
-    name: Library CMake Build
-    runs-on: ubuntu-20.04
-
-    strategy:
-      max-parallel: 1
-      matrix:
-        gpu_backend: ["cuda"]
-      fail-fast: false
-    steps:
-      - name: Checkout Git Repository
-        uses: actions/checkout@v3
-        with:
-          submodules: recursive
-
-      - name: Add helpers directory to path
-        run: echo "${PWD}/.github/workflows/helpers" >> $GITHUB_PATH
-
-      - name: Free additional space on runner
-        run: free_space_on_runner.sh
-
-      - name: Install nix
-        uses: cachix/install-nix-action@v25
-        with:
-          github_access_token: '${{ secrets.GITHUB_TOKEN }}'
-
-      - uses: cachix/cachix-action@v14
-        with:
-          name: ff
-          skipPush: true
-          # authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
-
-      - name: setup nix develop shell
-        uses: nicknovitski/nix-develop@v1.1.0
-        with:
-          arguments: "--accept-flake-config .#ci"
-
-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2
-
-      # - name: Install system dependencies
-      #   run: FF_GPU_BACKEND=${{ matrix.gpu_backend }} .github/workflows/helpers/install_dependencies.sh
-
-      # - name: Install conda and FlexFlow dependencies
-      #   uses: conda-incubator/setup-miniconda@v2
-      #   with:
-      #     activate-environment: flexflow
-      #     environment-file: packaging/conda/environment.yml
-      #     auto-activate-base: false
-
-      - name: Regenerate all dtgen files
-        run: |
-          proj dtgen --force
-
-      - name: Run cmake
-        run: |
-          cmake_${{ matrix.gpu_backend }}.sh
-
-      - name: Build utils
-        run: |
-          build_target.sh utils
-
-      - name: Build op-attrs
-        run: |
-          build_target.sh op-attrs
-
-      - name: Build pcg
-        run: |
-          build_target.sh pcg
-
-      - name: Build kernels
-        run: |
-          build_target.sh kernels
-
-      - name: Build substitutions
-        run: |
-          build_target.sh substitutions
-
-      - name: Build compiler
-        run: |
-          build_target.sh compiler
-
-      - name: Build substitution-generator
-        run: |
-          build_target.sh substitution-generator
-
-      - name: Build local-execution
-        run: |
-          build_target.sh local-execution
-
-      - name: Build models
-        run: |
-          build_target.sh models
-
-      - name: Build substitution-to-dot
-        run: |
-          build_target.sh substitution-to-dot
-
-      - name: Build export-model-arch
-        run: |
-          build_target.sh export-model-arch
-
-      - name: Test utils
-        run: |
-          test_target.sh utils
-
-      - name: Test op-attrs
-        run: |
-          test_target.sh op-attrs
-
-      - name: Test pcg
-        run: |
-          test_target.sh pcg
-
-      - name: Test substitutions
-        run: |
-          test_target.sh substitutions
-
-      - name: Test compiler
-        run: |
-          test_target.sh compiler
-
-      - name: Test substitution-generator
-        run: |
-          test_target.sh substitution-generator
-
-      - name: Test local-execution
-        run: |
-          test_target.sh local-execution
-
-      - name: Test models
-        run: |
-          test_target.sh models
-
-      - name: Generate code coverage
-        run: |
-          echo "gitwork: $GITHUB_WORKSPACE"
-          lcov --capture --directory . --output-file main_coverage.info
-          lcov --extract main_coverage.info "$GITHUB_WORKSPACE/lib/*" --output-file main_coverage.info
-          lcov --remove main_coverage.info "$GITHUB_WORKSPACE/lib/*.dtg.h" "$GITHUB_WORKSPACE/lib/*.dtg.cc" --output-file main_coverage.info
-          lcov --list main_coverage.info
-
-      - name: Upload code coverage
-        uses: codecov/codecov-action@v4
-        with:
-          token: ${{ secrets.CODECOV_TOKEN }}
-          file: main_coverage.info
-          flags: unittests
-          name: codecov-umbrella
-          fail_ci_if_error: false
-          verbose: true
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
new file mode 100644
index 0000000000..7e2dabd784
--- /dev/null
+++ b/.github/workflows/tests.yml
@@ -0,0 +1,116 @@
+name: "tests"
+on: [push, pull_request, workflow_dispatch]
+concurrency:
+  group: build-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  cpu-ci:
+    name: CPU unit tests and build
+    runs-on: ubuntu-20.04
+
+    steps:
+      - name: Checkout Git Repository
+        uses: actions/checkout@v3
+        with:
+          submodules: recursive
+
+      - name: Free additional space on runner
+        run: ./.github/workflows/helpers/free_space_on_runner_gpu.sh
+
+      - name: Install nix
+        uses: cachix/install-nix-action@v25
+        with:
+          github_access_token: '${{ secrets.GITHUB_TOKEN }}'
+
+      - uses: cachix/cachix-action@v14
+        with:
+          name: ff
+          skipPush: true
+          # authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
+
+      - name: setup nix develop shell
+        uses: nicknovitski/nix-develop@v1.1.0
+        env:
+          NIXPKGS_ALLOW_UNFREE: 1
+        with:
+          arguments: ".#ci --accept-flake-config"
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2
+
+      - name: Regenerate all dtgen files
+        run: |
+          proj dtgen --force
+
+      - name: Run cmake
+        run: |
+          proj cmake --dtgen-skip
+
+      - name: Run build and tests
+        run: |
+          proj test --dtgen-skip -j$(nproc) --coverage --skip-gpu-tests
+
+      - name: Upload code coverage
+        uses: codecov/codecov-action@v4
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
+          file: main_coverage.info
+          flags: unittests
+          name: codecov-umbrella
+          fail_ci_if_error: false
+          verbose: true
+
+  gpu-ci:
+    name: GPU unit tests
+    needs: cpu-ci
+    runs-on:
+      - runs-on
+      - family=g4dn.xlarge
+      - image=ubuntu22-full-x64
+
+    strategy:
+      max-parallel: 1
+      fail-fast: false
+
+    steps:
+      - name: checkout git repository
+        uses: actions/checkout@v3
+        with:
+          submodules: recursive
+
+      - name: free additional space on runner
+        run: ./.github/workflows/helpers/free_space_on_runner_gpu.sh
+
+      - name: install nix
+        uses: cachix/install-nix-action@v25
+        with:
+          github_access_token: '${{ secrets.GITHUB_TOKEN }}'
+
+      - uses: cachix/cachix-action@v14
+        with:
+          name: ff
+          skipPush: true
+          # authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
+
+      - name: setup nix develop shell
+        uses: nicknovitski/nix-develop@v1.1.0
+        env:
+          NIXPKGS_ALLOW_UNFREE: 1
+        with:
+          arguments: ".#gpu-ci --accept-flake-config --impure"
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2
+
+      - name: regenerate all dtgen files
+        run: |
+          proj dtgen --force
+
+      - name: run cmake
+        run: |
+          proj cmake --dtgen-skip
+
+      - name: build and run gpu tests
+        run: |
+          proj test --dtgen-skip -j$(nproc) --skip-build-cpu-tests
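
The CPU job above just enters the repository's ci dev shell and drives everything through proj, so it can be reproduced locally with the same commands (a rough sketch, assuming nix with flakes enabled; the nix develop invocation below approximates what the nicknovitski/nix-develop action does):

    # enter the ci dev shell defined in flake.nix
    nix develop .#ci --accept-flake-config
    # regenerate dtgen files, configure, then build and run the CPU-only test suite
    proj dtgen --force
    proj cmake --dtgen-skip
    proj test --dtgen-skip -j$(nproc) --coverage --skip-gpu-tests
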
diff --git a/.proj.toml b/.proj.toml
index 5592f184ad..10307a6efa 100644
--- a/.proj.toml
+++ b/.proj.toml
@@ -18,9 +18,9 @@ build_targets = [
 ]
 
 test_targets = [
-  # "kernels-tests",
   "utils-tests",
   "op-attrs-tests",
+  "kernels-tests",
   "pcg-tests",
   "substitutions-tests",
   "compiler-tests",
diff --git a/flake.lock b/flake.lock
index 1fb4f26189..1ebfe0b0b4 100644
--- a/flake.lock
+++ b/flake.lock
@@ -18,6 +18,29 @@
         "type": "github"
       }
     },
+    "nixGL": {
+      "inputs": {
+        "flake-utils": [
+          "flake-utils"
+        ],
+        "nixpkgs": [
+          "nixpkgs"
+        ]
+      },
+      "locked": {
+        "lastModified": 1713543440,
+        "narHash": "sha256-lnzZQYG0+EXl/6NkGpyIz+FEOc/DSEG57AP1VsdeNrM=",
+        "owner": "nix-community",
+        "repo": "nixGL",
+        "rev": "310f8e49a149e4c9ea52f1adf70cdc768ec53f8a",
+        "type": "github"
+      },
+      "original": {
+        "owner": "nix-community",
+        "repo": "nixGL",
+        "type": "github"
+      }
+    },
     "nixpkgs": {
       "locked": {
         "lastModified": 1710162809,
@@ -43,11 +66,11 @@
         ]
       },
       "locked": {
-        "lastModified": 1731206929,
-        "narHash": "sha256-5O85Ydkk4AG8F3Y5pFj3aywCZwGqmvOj1DFnIXgfyxs=",
+        "lastModified": 1737085085,
+        "narHash": "sha256-5b6ytCXd7RTQAt0/4uFbZPre98SkCEKNeedaLsbdYE4=",
         "owner": "lockshaw",
         "repo": "proj",
-        "rev": "99d4df1a81b3b7a6595e9e7913b20f9e6a7f5e21",
+        "rev": "d6a664dfc4a378d6b9cfaf9937cd9514f164c558",
         "type": "github"
       },
       "original": {
@@ -59,6 +82,7 @@
     "root": {
       "inputs": {
         "flake-utils": "flake-utils",
+        "nixGL": "nixGL",
         "nixpkgs": "nixpkgs",
         "proj-repo": "proj-repo"
       }
diff --git a/flake.nix b/flake.nix
index 38e59a81be..91651bd0c1 100644
--- a/flake.nix
+++ b/flake.nix
@@ -22,9 +22,15 @@
       inputs.nixpkgs.follows = "nixpkgs";
       inputs.flake-utils.follows = "flake-utils";
     };
+
+    nixGL = {
+      url = "github:nix-community/nixGL";
+      inputs.nixpkgs.follows = "nixpkgs";
+      inputs.flake-utils.follows = "flake-utils";
+    };
   };
 
-  outputs = { self, nixpkgs, flake-utils, proj-repo, ... }: flake-utils.lib.eachSystem [ "x86_64-linux" ] (system:
+  outputs = { self, nixpkgs, flake-utils, proj-repo, nixGL, ... }: flake-utils.lib.eachSystem [ "x86_64-linux" ] (system:
     let
       pkgs = import nixpkgs {
         inherit system;
@@ -65,24 +71,20 @@
         ci = mkShell {
           shellHook = ''
             export PATH="$HOME/ff/.scripts/:$PATH"
+            export RC_PARAMS="max_discard_ratio=100"
+            export CMAKE_FLAGS="-DFF_USE_EXTERNAL_LEGION=ON \
+              -DFF_USE_EXTERNAL_NCCL=ON \
+              -DFF_USE_EXTERNAL_JSON=ON \
+              -DFF_USE_EXTERNAL_FMT=ON \
+              -DFF_USE_EXTERNAL_SPDLOG=ON \
+              -DFF_USE_EXTERNAL_DOCTEST=ON \
+              -DFF_USE_EXTERNAL_RAPIDCHECK=ON \
+              -DFF_USE_EXTERNAL_EXPECTED=ON \
+              -DFF_USE_EXTERNAL_RANGEV3=ON \
+              -DFF_USE_EXTERNAL_BOOST_PREPROCESSOR=ON \
+              -DFF_USE_EXTERNAL_TYPE_INDEX=ON"
           '';
 
-          CMAKE_FLAGS = lib.strings.concatStringsSep " " [
-            "-DFF_USE_EXTERNAL_LEGION=ON"
-            "-DFF_USE_EXTERNAL_NCCL=ON"
-            "-DFF_USE_EXTERNAL_JSON=ON"
-            "-DFF_USE_EXTERNAL_FMT=ON"
-            "-DFF_USE_EXTERNAL_SPDLOG=ON"
-            "-DFF_USE_EXTERNAL_DOCTEST=ON"
-            "-DFF_USE_EXTERNAL_RAPIDCHECK=ON"
-            "-DFF_USE_EXTERNAL_EXPECTED=ON"
-            "-DFF_USE_EXTERNAL_RANGEV3=ON"
-            "-DFF_USE_EXTERNAL_BOOST_PREPROCESSOR=ON"
-            "-DFF_USE_EXTERNAL_TYPE_INDEX=ON"
-          ];
-
-          RC_PARAMS = "max_discard_ratio=100";
-
           buildInputs = builtins.concatLists [
@@ -104,20 +106,30 @@
               tl-expected
               doxygen
               lcov # for code coverage
+              compdb
+            ])
+            (with proj-repo.packages.${system}; [
+              proj
             ])
-            [ proj ]
             (with self.packages.${system}; [
               legion
-              hpp2plantuml
               rapidcheckFull
               doctest
             ])
           ];
         };
 
+        gpu-ci = mkShell {
+          inputsFrom = [ ci ];
+          buildInputs = builtins.concatLists [
+            (with nixGL.packages.${system}; [
+              nixGLDefault
+            ])
+          ];
+        };
+
         default = mkShell {
           inputsFrom = [ ci ];
-          inherit (ci) CMAKE_FLAGS RC_PARAMS;
 
           VIMPLUGINS = lib.strings.concatStringsSep "," [
@@ -130,10 +142,8 @@
               shellcheck
               plantuml
               ruff
-              compdb
               jq
               gh
-              lcov # for code coverage
             ])
             (with pkgs.python3Packages; [
               gitpython
@@ -150,9 +160,14 @@
             ])
             (with self.packages.${system}; [
               ffdb
+              hpp2plantuml
             ])
           ];
         };
+
+        gpu = mkShell {
+          inputsFrom = [ gpu-ci default ];
+        };
       };
     }
   );
diff --git a/lib/kernels/test/CMakeLists.txt b/lib/kernels/test/CMakeLists.txt
index 007740b510..00da2d0d70 100644
--- a/lib/kernels/test/CMakeLists.txt
+++ b/lib/kernels/test/CMakeLists.txt
@@ -15,3 +15,10 @@ ff_add_test_executable(
     cudart
     cublas
 )
+
+set(FF_TEST_EXEC_NAME "kernels-tests")
+add_custom_command(
+  TARGET ${FF_TEST_EXEC_NAME} POST_BUILD
+  COMMAND ${CMAKE_COMMAND} -DFF_TEST_EXEC_NAME=${FF_TEST_EXEC_NAME} -P ${CMAKE_CURRENT_LIST_DIR}/modify_test_commands.cmake
+  DEPENDS ${FF_TEST_EXEC_NAME}
+)
diff --git a/lib/kernels/test/modify_test_commands.cmake b/lib/kernels/test/modify_test_commands.cmake
new file mode 100644
index 0000000000..6494ae2d78
--- /dev/null
+++ b/lib/kernels/test/modify_test_commands.cmake
@@ -0,0 +1,21 @@
+# modify_test_commands.cmake
+
+file(GLOB ctest_tests_files "${CMAKE_CURRENT_BINARY_DIR}/${FF_TEST_EXEC_NAME}_tests-*.cmake")
+
+foreach(ctest_tests_file IN LISTS ctest_tests_files)
+  file(READ "${ctest_tests_file}" content)
+
+  # add nix run prefix
+  string(REGEX REPLACE
+    "add_test\\([ \t\r\n]*\\[==\\[([^]]+)\\]==\\][ \t\r\n]+([^ ]+)[ \t\r\n]+\\[==\\[([^]]+)\\]==\\]\\)"
+    "add_test( [==[\\1]==] nixGL -- \\2 [==[\\3]==])"
+    content "${content}")
+
+  # add environment
+  # string(REGEX REPLACE
+  #   "set_tests_properties\\([ \t\r\n]*\\[==\\[([^]]+)\\]==\\][ \t\r\n]+PROPERTIES[ \t\r\n]+([^)]+)\\)"
+  #   "set_tests_properties( [==[\\1]==] PROPERTIES \\2 ENVIRONMENT \"NIXPKGS_ALLOW_UNFREE=1\")"
+  #   content "${content}")
+
+  file(WRITE "${ctest_tests_file}" "${content}")
+endforeach()
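
For context, the string(REGEX REPLACE ...) above rewrites the add_test() entries in the generated ctest script files for kernels-tests so that each test binary is launched through the nixGL wrapper, which supplies the host's GPU driver libraries inside the nix environment. Schematically (the test name and path below are illustrative, not taken from a real generated file):

    # before the post-build rewrite
    add_test( [==[SomeKernelTestCase]==] /path/to/kernels-tests [==[--test-case=SomeKernelTestCase]==])
    # after the post-build rewrite
    add_test( [==[SomeKernelTestCase]==] nixGL -- /path/to/kernels-tests [==[--test-case=SomeKernelTestCase]==])
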
diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc
index 55ce7da331..2212e384fa 100644
--- a/lib/kernels/test/src/test_concat_kernel.cc
+++ b/lib/kernels/test/src/test_concat_kernel.cc
@@ -21,7 +21,7 @@ TEST_SUITE(FF_TEST_SUITE) {
 
     SUBCASE("forward_kernel") {
       std::vector<GenericTensorAccessorR> input_accessors =
-          repeat(num_inputs, [&]() {
+          repeat<GenericTensorAccessorR>(num_inputs, [&]() {
             return read_only_accessor_from_write_accessor(
                 create_random_filled_accessor_w(input_shape, allocator));
           });
@@ -44,9 +44,10 @@ TEST_SUITE(FF_TEST_SUITE) {
       GenericTensorAccessorR output_grad_accessor =
           read_only_accessor_from_write_accessor(
              create_random_filled_accessor_w(output_shape, allocator));
-      std::vector<GenericTensorAccessorW> input_grad_accessors = repeat(
-          num_inputs, [&]() { return allocator.allocate_tensor(input_shape); });
-
+      std::vector<GenericTensorAccessorW> input_grad_accessors =
+          repeat<GenericTensorAccessorW>(num_inputs, [&]() {
+            return allocator.allocate_tensor(input_shape);
+          });
       Kernels::Concat::backward_kernel(managed_stream.raw_stream(),
                                        output_grad_accessor,
                                        input_grad_accessors,
diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc
index 81f3c7183a..e29143e251 100644
--- a/lib/kernels/test/src/test_dropout.cc
+++ b/lib/kernels/test/src/test_dropout.cc
@@ -25,7 +25,8 @@ TEST_SUITE(FF_TEST_SUITE) {
         managed_handle.raw_handle(), dropout_rate, seed, shape, allocator);
 
     auto get_zero_count = [](std::vector<float> const &data) {
-      return count(data, [](float x) { return x == 0.0f; });
+      return std::count_if(
+          data.begin(), data.end(), [](float x) { return x == 0.0f; });
     };
 
     SUBCASE("forward_kernel") {
diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc
index 7cc2b28c9e..f2346c9244 100644
--- a/lib/kernels/test/src/test_split_kernel.cc
+++ b/lib/kernels/test/src/test_split_kernel.cc
@@ -23,7 +23,8 @@ TEST_SUITE(FF_TEST_SUITE) {
     GenericTensorAccessorW input_accessor =
         create_random_filled_accessor_w(input_shape, allocator);
-    std::vector<float *> output_ptrs = repeat(num_outputs, [&]() {
+    std::vector<float *> output_ptrs(num_outputs);
+    generate_n(output_ptrs.begin(), num_outputs, [&]() {
       GenericTensorAccessorW output_accessor =
           allocator.allocate_tensor(output_shape);
       return output_accessor.get_float_ptr();
     });
diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h
index abce3fd444..21d4923881 100644
--- a/lib/kernels/test/src/test_utils.h
+++ b/lib/kernels/test/src/test_utils.h
@@ -5,7 +5,13 @@
 #include "kernels/local_cuda_allocator.h"
 #include "kernels/managed_ff_stream.h"
 #include "kernels/managed_per_device_ff_handle.h"
+#include <algorithm>
 #include <doctest/doctest.h>
+#include <sstream>
+#include <string>
+#include <vector>
+
+using namespace FlexFlow;
 
 GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape,
                                                        Allocator &allocator,
@@ -42,7 +48,33 @@ std::vector<float> load_data_to_host_from_device(GenericTensorAccessorR accessor) {
 
 template <typename T>
 bool contains_non_zero(std::vector<T> &data) {
-  return !all_of(data, [](T const &val) { return val == 0; });
+  return !all_of(
+      data.begin(), data.end(), [](T const &val) { return val == 0; });
+}
+
+template <typename T, typename Func>
+std::vector<T> repeat(std::size_t n, Func &&func) {
+  std::vector<T> result;
+  // result.reserve(n); // Sometimes we don't have default constructor for T
+  for (std::size_t i = 0; i < n; ++i) {
+    result.push_back(func());
+  }
+  return result;
 }
 
+// Specialize doctest's StringMaker for std::vector
+template <>
+struct doctest::StringMaker<std::vector<float>> {
+  static doctest::String convert(std::vector<float> const &vec) {
+    std::ostringstream oss;
+    for (size_t i = 0; i < vec.size(); ++i) {
+      oss << vec[i];
+      if (i != vec.size() - 1) {
+        oss << ", ";
+      }
+    }
+    return doctest::String(("[" + oss.str() + "]").c_str());
+  }
+};
+
 #endif
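
As a usage sketch of the two helpers added in the hunk above (the float element type and the explicit template argument follow the reconstruction of this hunk, so treat the exact signatures as assumptions rather than a definitive API):

    // build a vector by invoking the callable n times
    std::vector<float> actual = repeat<float>(3, []() { return 0.0f; });
    // if this assertion fails, doctest renders both vectors through
    // StringMaker<std::vector<float>>, e.g. "[0, 0, 0]" instead of the default "{?}"
    CHECK(actual == std::vector<float>{1.0f, 2.0f, 3.0f});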