# RHOAIENG-32532: Update kueue integration #1
# Workflow file for this run
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# RayJob end-to-end tests with Kueue.
#
# On pull requests, this workflow starts a KinD cluster, builds and deploys
# Kueue from source, installs the KubeRay operator, creates an `sdk-user`
# with a restricted set of cluster roles, and runs ./tests/e2e/rayjob/ as
# that user. Operator and pod logs are collected and uploaded as an artifact
# whether or not the tests pass.
name: rayjob-e2e-with-kueue

on:
  pull_request:
    branches:
      - main
      - 'release-*'
      - ray-jobs-feature
    paths-ignore:
      - 'docs/**'
      - '**.adoc'
      - '**.md'
      - 'LICENSE'

# A newer run for the same PR branch cancels the one still in progress.
concurrency:
  group: ${{ github.head_ref }}-${{ github.workflow }}
  cancel-in-progress: true

# Least privilege: the steps below only read repository contents.
permissions:
  contents: read

env:
  CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
  # Kueue git ref that is checked out and built from source below.
  KUEUE_VERSION: "v0.13.3"

jobs:
  kubernetes:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          submodules: recursive

      # Provides the local composite actions under ./common/github-actions.
      - name: Checkout common repo code
        uses: actions/checkout@v4
        with:
          repository: 'project-codeflare/codeflare-common'
          ref: 'main'
          path: 'common'

      # Its go.mod / go.sum drive the Go toolchain setup and cache key below.
      - name: Checkout CodeFlare operator repository
        uses: actions/checkout@v4
        with:
          repository: project-codeflare/codeflare-operator
          path: codeflare-operator

      - name: Checkout Kueue repository
        uses: actions/checkout@v4
        with:
          repository: kubernetes-sigs/kueue
          path: kueue
          # Pinned to the declared Kueue version instead of the moving `main`
          # branch so that CI results are reproducible.
          ref: ${{ env.KUEUE_VERSION }}

      - name: Set Go
        uses: actions/setup-go@v5
        with:
          go-version-file: './codeflare-operator/go.mod'
          cache-dependency-path: |
            ./codeflare-operator/go.sum
            ./kueue/go.sum

      - name: Set up gotestfmt
        uses: gotesttools/gotestfmt-action@v2
        with:
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Set up specific Python version
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
          cache: 'pip'  # caching pip dependencies

      - name: Setup and start KinD cluster
        uses: ./common/github-actions/kind
        with:
          worker-nodes: 2  # Multiple nodes for testing Kueue scheduling

      - name: Build and install Kueue
        run: |
          cd kueue
          echo "Building Kueue..."
          make manifests
          make install
          make kind-image-build
          # Load Kueue image into Kind
          IMAGE_TAG=$(git describe --tags --dirty --always)
          kind load docker-image "us-central1-docker.pkg.dev/k8s-staging-images/kueue/kueue:${IMAGE_TAG}"
          # Deploy Kueue
          kubectl apply --server-side --force-conflicts -k config/default
          # Wait for Kueue deployment
          kubectl wait --timeout=120s --for=condition=Available=true deployment -n kueue-system kueue-controller-manager
          # Patch deployment to use local image
          # NOTE(review): this patch runs after the Available wait above and
          # nothing waits for the rollout it triggers - confirm that is intended.
          kubectl patch deployment kueue-controller-manager -n kueue-system -p '{"spec":{"template":{"spec":{"containers":[{"name":"manager","imagePullPolicy":"Never"}]}}}}'
          cd ..

      - name: Deploy KubeRay operator
        run: |
          echo "Installing KubeRay operator..."
          kubectl create -k "github.com/ray-project/kuberay/ray-operator/config/default?ref=v1.2.2"
          kubectl wait --timeout=120s --for=condition=Available=true deployment -n ray-system kuberay-operator

      - name: Add user to KinD
        uses: ./common/github-actions/kind-add-user
        with:
          user-name: sdk-user

      # Each clusterrole/clusterrolebinding pair grants sdk-user one narrow
      # capability; the final command switches kubectl to the sdk-user context
      # so the tests run with only these permissions.
      - name: Configure RBAC for sdk user with limited permissions
        run: |
          # Basic permissions
          kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
          kubectl create clusterrolebinding sdk-user-list-ingresses --clusterrole=list-ingresses --user=sdk-user
          kubectl create clusterrole namespace-creator --verb=get,list,create,delete,patch --resource=namespaces
          kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user
          # Ray permissions
          kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch,watch --resource=rayclusters
          kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user
          kubectl create clusterrole rayjob-creator --verb=get,list,create,delete,patch,watch,update --resource=rayjobs
          kubectl create clusterrolebinding sdk-user-rayjob-creator --clusterrole=rayjob-creator --user=sdk-user
          # Kueue permissions
          kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors
          kubectl create clusterrolebinding sdk-user-resourceflavor-creator --clusterrole=resourceflavor-creator --user=sdk-user
          kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues
          kubectl create clusterrolebinding sdk-user-clusterqueue-creator --clusterrole=clusterqueue-creator --user=sdk-user
          kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues
          kubectl create clusterrolebinding sdk-user-localqueue-creator --clusterrole=localqueue-creator --user=sdk-user
          kubectl create clusterrole workload-creator --verb=get,list,watch --resource=workloads
          kubectl create clusterrolebinding sdk-user-workload-creator --clusterrole=workload-creator --user=sdk-user
          # Additional permissions
          kubectl create clusterrole list-secrets --verb=get,list --resource=secrets
          kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user
          kubectl create clusterrole pod-creator --verb=get,list,watch --resource=pods
          kubectl create clusterrolebinding sdk-user-pod-creator --clusterrole=pod-creator --user=sdk-user
          kubectl create clusterrole service-reader --verb=get,list,watch --resource=services
          kubectl create clusterrolebinding sdk-user-service-reader --clusterrole=service-reader --user=sdk-user
          kubectl create clusterrole port-forward-pods --verb=create --resource=pods/portforward
          kubectl create clusterrolebinding sdk-user-port-forward-pods-binding --clusterrole=port-forward-pods --user=sdk-user
          kubectl create clusterrole node-reader --verb=get,list --resource=nodes
          kubectl create clusterrolebinding sdk-user-node-reader --clusterrole=node-reader --user=sdk-user
          kubectl config use-context sdk-user

      - name: Run RayJob e2e tests
        run: |
          set -euo pipefail
          # Assign separately from export so a mktemp failure is not masked.
          CODEFLARE_TEST_OUTPUT_DIR="$(mktemp -d)"
          export CODEFLARE_TEST_OUTPUT_DIR
          # Expose the directory to the later log-collection steps.
          echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> "$GITHUB_ENV"
          pip install poetry
          poetry install --with test,docs
          # Install the SDK in editable mode
          pip install -e .
          echo "Running RayJob e2e tests..."
          # Output goes to a file; the "Print Pytest output log" step shows it.
          poetry run pytest -v -s ./tests/e2e/rayjob/ -x > "${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log" 2>&1
        env:
          GRPC_DNS_RESOLVER: "native"

      # The remaining steps use `if: always()` so logs are collected even
      # when an earlier step failed.
      - name: Switch to kind-cluster context to print logs
        if: always()
        run: kubectl config use-context kind-cluster

      - name: Print Pytest output log
        if: always()
        run: |
          echo "Printing Pytest output logs"
          cat "${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log" || true

      - name: Print Kueue operator logs
        if: always()
        run: |
          echo "Printing Kueue operator logs"
          kubectl logs -n kueue-system --tail -1 -l control-plane=controller-manager | tee "${CODEFLARE_TEST_OUTPUT_DIR}/kueue-operator.log" || true

      - name: Print KubeRay operator logs
        if: always()
        run: |
          echo "Printing KubeRay operator logs"
          kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay-operator | tee "${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log" || true

      - name: Export all KinD pod logs
        uses: ./common/github-actions/kind-export-logs
        if: always()
        with:
          output-directory: ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}

      - name: Upload logs
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: logs
          retention-days: 10
          path: |
            ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log