RHOAIENG-32532: Update kueue integration #5
name: rayjob-e2e-with-kueue
on:
pull_request:
branches:
- main
- 'release-*'
- ray-jobs-feature
paths-ignore:
- 'docs/**'
- '**.adoc'
- '**.md'
- 'LICENSE'
concurrency:
group: ${{ github.head_ref }}-${{ github.workflow }}
cancel-in-progress: true
env:
CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
KUEUE_VERSION: "v0.13.3"
jobs:
kubernetes:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
submodules: recursive
- name: Checkout common repo code
uses: actions/checkout@v4
with:
repository: 'project-codeflare/codeflare-common'
ref: 'main'
path: 'common'
- name: Checkout CodeFlare operator repository
uses: actions/checkout@v4
with:
repository: project-codeflare/codeflare-operator
path: codeflare-operator
- name: Checkout Kueue repository
uses: actions/checkout@v4
with:
repository: kubernetes-sigs/kueue
path: kueue
ref: main
- name: Set up Go
uses: actions/setup-go@v5
with:
go-version-file: './codeflare-operator/go.mod'
cache-dependency-path: |
./codeflare-operator/go.sum
./kueue/go.sum
- name: Set up gotestfmt
uses: gotesttools/gotestfmt-action@v2
with:
token: ${{ secrets.GITHUB_TOKEN }}
- name: Set up specific Python version
uses: actions/setup-python@v5
with:
python-version: '3.11'
cache: 'pip' # caching pip dependencies
- name: Setup and start KinD cluster
uses: ./common/github-actions/kind
with:
worker-nodes: 2 # Multiple nodes for testing Kueue scheduling
- name: Verify Kind cluster
run: |
echo "Checking Kind clusters..."
kind get clusters
echo "Current kubectl context:"
kubectl config current-context
echo "Checking nodes:"
kubectl get nodes
- name: Build and install Kueue
run: |
cd kueue
echo "Building Kueue..."
make manifests
make install
make kind-image-build
# Get the actual cluster name
CLUSTER_NAME=$(kind get clusters | head -n 1)
echo "Using Kind cluster: ${CLUSTER_NAME}"
# Load Kueue image into Kind - this loads to all nodes
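# The Kueue Makefile tags the locally built image with 'git describe', so the same
# tag is recomputed here to reference the freshly built image.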
IMAGE_TAG=$(git describe --tags --dirty --always)
echo "Loading image: us-central1-docker.pkg.dev/k8s-staging-images/kueue/kueue:${IMAGE_TAG}"
kind load docker-image us-central1-docker.pkg.dev/k8s-staging-images/kueue/kueue:${IMAGE_TAG} --name ${CLUSTER_NAME}
# Also load with 'main' tag as fallback
docker tag us-central1-docker.pkg.dev/k8s-staging-images/kueue/kueue:${IMAGE_TAG} us-central1-docker.pkg.dev/k8s-staging-images/kueue/kueue:main
kind load docker-image us-central1-docker.pkg.dev/k8s-staging-images/kueue/kueue:main --name ${CLUSTER_NAME}
# Deploy Kueue
kubectl apply --server-side --force-conflicts -k config/default
# Patch to use the specific image tag and Never pull policy
kubectl patch deployment kueue-controller-manager -n kueue-system --type='json' -p='[
{"op": "replace", "path": "/spec/template/spec/containers/0/image", "value": "us-central1-docker.pkg.dev/k8s-staging-images/kueue/kueue:'${IMAGE_TAG}'"},
{"op": "replace", "path": "/spec/template/spec/containers/0/imagePullPolicy", "value": "Never"}
]'
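# imagePullPolicy=Never makes the kubelet use the image loaded into the kind nodes
# above instead of trying to pull it from the registry.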
# Wait for rollout to complete
kubectl rollout status deployment/kueue-controller-manager -n kueue-system --timeout=120s
# NOTE: the RayJob tests expect a default LocalQueue in their namespaces; without one,
# the Kueue controller logs "Handling job with no workload" errors (see sketch below).
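# A minimal sketch of the Kueue objects such a default queue would need; the names
# "default-flavor", "cluster-queue", and "local-queue" and the quotas are illustrative,
# not taken from this workflow. Uncomment and adapt only if the tests should not
# create their own queues:
# kubectl apply -f - <<EOF
# apiVersion: kueue.x-k8s.io/v1beta1
# kind: ResourceFlavor
# metadata:
#   name: default-flavor
# ---
# apiVersion: kueue.x-k8s.io/v1beta1
# kind: ClusterQueue
# metadata:
#   name: cluster-queue
# spec:
#   namespaceSelector: {}
#   resourceGroups:
#   - coveredResources: ["cpu", "memory"]
#     flavors:
#     - name: default-flavor
#       resources:
#       - name: "cpu"
#         nominalQuota: 8
#       - name: "memory"
#         nominalQuota: 16Gi
# ---
# apiVersion: kueue.x-k8s.io/v1beta1
# kind: LocalQueue
# metadata:
#   name: local-queue
#   namespace: default
# spec:
#   clusterQueue: cluster-queue
# EOF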
cd ..
- name: Deploy KubeRay operator
run: |
echo "Installing KubeRay operator..."
# Install KubeRay CRDs and operator
kubectl create -k "github.com/ray-project/kuberay/ray-operator/config/default?ref=v1.2.2&timeout=300s" || true
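# '|| true' tolerates AlreadyExists errors from 'kubectl create' when the CRDs and
# operator are already present on the cluster.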
# Ensure the ray-system namespace exists (the kustomize apply above normally creates it)
kubectl get namespace ray-system || kubectl create namespace ray-system
# Check if deployment exists and wait for it
kubectl wait --timeout=300s --for=condition=Available=true deployment -n ray-system kuberay-operator || {
echo "KubeRay operator deployment not found, checking pods..."
kubectl get all -n ray-system
exit 1
}
- name: Add user to KinD
uses: ./common/github-actions/kind-add-user
with:
user-name: sdk-user
- name: Configure RBAC for sdk user with limited permissions
run: |
# Basic permissions
kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
kubectl create clusterrolebinding sdk-user-list-ingresses --clusterrole=list-ingresses --user=sdk-user
kubectl create clusterrole namespace-creator --verb=get,list,create,delete,patch --resource=namespaces
kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user
# Ray permissions
kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch,watch --resource=rayclusters
kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user
kubectl create clusterrole rayjob-creator --verb=get,list,create,delete,patch,watch,update --resource=rayjobs
kubectl create clusterrolebinding sdk-user-rayjob-creator --clusterrole=rayjob-creator --user=sdk-user
# Kueue permissions
kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors
kubectl create clusterrolebinding sdk-user-resourceflavor-creator --clusterrole=resourceflavor-creator --user=sdk-user
kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues
kubectl create clusterrolebinding sdk-user-clusterqueue-creator --clusterrole=clusterqueue-creator --user=sdk-user
kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues
kubectl create clusterrolebinding sdk-user-localqueue-creator --clusterrole=localqueue-creator --user=sdk-user
kubectl create clusterrole workload-creator --verb=get,list,watch --resource=workloads
kubectl create clusterrolebinding sdk-user-workload-creator --clusterrole=workload-creator --user=sdk-user
# Additional permissions
kubectl create clusterrole list-secrets --verb=get,list --resource=secrets
kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user
kubectl create clusterrole pod-creator --verb=get,list,watch --resource=pods
kubectl create clusterrolebinding sdk-user-pod-creator --clusterrole=pod-creator --user=sdk-user
kubectl create clusterrole service-reader --verb=get,list,watch --resource=services
kubectl create clusterrolebinding sdk-user-service-reader --clusterrole=service-reader --user=sdk-user
kubectl create clusterrole port-forward-pods --verb=create --resource=pods/portforward
kubectl create clusterrolebinding sdk-user-port-forward-pods-binding --clusterrole=port-forward-pods --user=sdk-user
kubectl create clusterrole node-reader --verb=get,list --resource=nodes
kubectl create clusterrolebinding sdk-user-node-reader --clusterrole=node-reader --user=sdk-user
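# Optional sanity check (illustrative, not part of the test flow): confirm the
# restricted user has the expected permissions before switching context, e.g.
#   kubectl auth can-i create rayjobs.ray.io --as=sdk-user
#   kubectl auth can-i create localqueues.kueue.x-k8s.io --as=sdk-user
#   kubectl auth can-i delete nodes --as=sdk-user   # should print "no"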
kubectl config use-context sdk-user
- name: Setup test output directory
run: |
CODEFLARE_TEST_OUTPUT_DIR="${{ runner.temp }}/test-logs"
mkdir -p ${CODEFLARE_TEST_OUTPUT_DIR}
echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV
- name: Run RayJob e2e tests
run: |
set -euo pipefail
pip install poetry
poetry install --with test,docs
# Install the SDK in editable mode
pip install -e .
echo "Running RayJob e2e tests..."
# Set environment variable to prevent default queue assignment for non-Kueue tests
export DISABLE_DEFAULT_KUEUE_QUEUE=true
# Run only the tests that are designed for Kueue integration
poetry run pytest -v -s ./tests/e2e/rayjob/rayjob_existing_cluster_test.py ./tests/e2e/rayjob/rayjob_lifecycled_cluster_test.py -x > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
env:
GRPC_DNS_RESOLVER: "native"
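# "native" selects gRPC's OS resolver instead of c-ares, which can be unreliable
# for localhost/port-forwarded endpoints on CI runners.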
- name: Switch to kind-cluster context to print logs
if: always()
run: kubectl config use-context kind-cluster
- name: Print Pytest output log
if: always()
run: |
echo "Printing Pytest output logs"
cat ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/pytest_output.log || true
- name: Print Kueue operator logs
if: always()
run: |
echo "Printing Kueue operator logs"
kubectl logs -n kueue-system --tail -1 -l control-plane=controller-manager | tee ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/kueue-operator.log || true
- name: Print KubeRay operator logs
if: always()
run: |
echo "Printing KubeRay operator logs"
echo "Checking ray-system namespace contents:"
kubectl get all -n ray-system || true
echo "Attempting to get KubeRay logs with different selectors:"
kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay-operator | tee ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/kuberay.log || \
kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/component=kuberay-operator | tee ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/kuberay.log || \
kubectl logs -n ray-system --tail -1 deployment/kuberay-operator | tee ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/kuberay.log || \
echo "Could not find KubeRay operator logs"
- name: Export all KinD pod logs
uses: ./common/github-actions/kind-export-logs
if: always()
with:
output-directory: ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}
- name: Upload logs
uses: actions/upload-artifact@v4
if: always()
with:
name: logs
retention-days: 10
path: |
${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log
if-no-files-found: warn