Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
dfbbc4c
Github action to deploy on EKS
afgambin Jul 31, 2025
a9298e5
Addressing Mano's review - bootstrapping
afgambin Aug 4, 2025
d37aea1
Updating AWS credentials config step
afgambin Aug 4, 2025
11492a4
Adding PR testing
afgambin Aug 4, 2025
4ff9065
Updating the .yaml file
afgambin Aug 5, 2025
c2edbea
Adding a second testing .yaml file
afgambin Aug 5, 2025
76a8cd5
Adding dependencies files
afgambin Aug 5, 2025
434c5d4
Updates to the workflow
afgambin Aug 5, 2025
2fc8824
Update eksctl runner
afgambin Aug 5, 2025
cbde8ce
Updating the action
afgambin Aug 5, 2025
2672f0f
Adding pre-creation cluster steps
afgambin Aug 5, 2025
1b4f8e5
Updated CloudFormation run
afgambin Aug 5, 2025
3e12482
Merge remote-tracking branch 'origin/track/1.10' into kf-7803-gh-acti…
afgambin Aug 5, 2025
658c59e
Testing with a new cluster
afgambin Aug 5, 2025
dcf3128
Updating dependency versions
afgambin Aug 5, 2025
11306fb
Debugging
afgambin Aug 6, 2025
add2842
Updating tox dependencies
afgambin Aug 6, 2025
226f35c
pytest missing
afgambin Aug 6, 2025
0c880e1
Clean up namespace
afgambin Aug 6, 2025
d98ccd8
Tweak to the namespace clean up
afgambin Aug 6, 2025
c0bc628
Remove model creation from CLI
afgambin Aug 7, 2025
c4a6014
Passing AWS credentials to tox env
afgambin Aug 7, 2025
c2d4df4
Juju version
afgambin Aug 7, 2025
46964cb
Pinning Juju version to 3.6/stable
afgambin Aug 7, 2025
128b0d9
Pinning Juju version
afgambin Aug 7, 2025
6b80f14
Adding deleting AWS volumes workflow
afgambin Aug 8, 2025
c0bc3cf
Merge remote-tracking branch 'origin/track/1.10' into kf-7803-gh-acti…
afgambin Aug 8, 2025
a794de2
Fixing dependencies
afgambin Aug 8, 2025
b8b5586
Fixing typo with AWS volumes section
afgambin Aug 8, 2025
b7d445a
Setting regions as output for AWS delete volumes
afgambin Aug 8, 2025
1346a0f
Fixing duplicated code
afgambin Aug 8, 2025
00949df
Removing testing sections
afgambin Aug 8, 2025
a1a3400
Bug fixing juju controller step
afgambin Aug 8, 2025
87f6189
Fixing region pass to reusable workflow
afgambin Aug 11, 2025
11cb7dd
Updating AWS credentials config
afgambin Aug 13, 2025
9997908
Testing a version 2 of the action
afgambin Aug 13, 2025
ada5477
Testing without pinning Python version
afgambin Aug 14, 2025
787a1eb
Removing testing yaml file
afgambin Aug 20, 2025
a2cdbbf
Apply suggestions from code review
afgambin Aug 21, 2025
281edc2
Testing without pinning Python version
afgambin Aug 29, 2025
43bf954
Updating action: no Python version pinning needed
afgambin Aug 29, 2025
8ed1137
Removing labels from cluster.yaml file
afgambin Sep 1, 2025
6a9c3f0
K8s version updated in cluster config file
afgambin Sep 5, 2025
98292d2
Merge remote-tracking branch 'origin/track/1.10' into kf-7803-gh-acti…
afgambin Sep 9, 2025
68ecd8b
Removing triggering action with PR
afgambin Sep 9, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions .github/cluster.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# eksctl ClusterConfig for the test EKS cluster.
# The deploy-to-eks workflow rewrites .metadata.name and .metadata.version
# in place (via yq) before running `eksctl create cluster -f` on this file.
apiVersion: eksctl.io/v1alpha5
availabilityZones:
  - eu-central-1a
  - eu-central-1b
cloudWatch:
  # Empty mapping: no control-plane log types enabled.
  clusterLogging: {}
iam:
  vpcResourceControllerPolicy: true
  withOIDC: false
addons:
  - name: aws-ebs-csi-driver
    # NOTE(review): this value is a managed *policy* ARN, but the field name
    # says it expects an IAM *role* ARN (IRSA). With withOIDC: false above,
    # IRSA is unavailable anyway — verify the addon actually gets EBS
    # permissions (the node group's withAddonPolicies.ebs below may be what
    # makes this work). TODO confirm.
    serviceAccountRoleARN: "arn:aws:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy"
kind: ClusterConfig
kubernetesNetworkConfig:
  ipFamily: IPv4
managedNodeGroups:
  - amiFamily: Ubuntu2204
    iam:
      withAddonPolicies:
        # Grants the node role EBS permissions (used by the EBS CSI driver).
        ebs: true
    instanceType: t2.2xlarge
    # Fixed-size group: min == max == 2 nodes.
    maxSize: 2
    minSize: 2
    name: ng-d06bd84e
    # Empty string: let eksctl pick the AMI release for the K8s version.
    releaseVersion: ""
    ssh:
      # Requires an SSH public key at create time (the workflow generates one).
      allow: true
    tags:
      alpha.eksctl.io/nodegroup-name: ng-d06bd84e
      alpha.eksctl.io/nodegroup-type: managed
    # Root volume size in GiB per node.
    volumeSize: 100
metadata:
  # name and version are overwritten by the workflow before cluster creation.
  name: kubeflow-test
  region: eu-central-1
  version: "1.32"
58 changes: 58 additions & 0 deletions .github/workflows/delete-aws-volumes.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# Deletes all unattached (state=available) EBS volumes, either in one region
# or across every region in the account. Runnable manually and reusable from
# other workflows (workflow_call).
name: Delete unattached (available) EBS volumes

on:
  workflow_dispatch:
    inputs:
      region:
        description: "AWS region to clean. Leave empty to clean ALL regions."
        required: false
        default: ""

  workflow_call:
    inputs:
      region:
        description: "AWS region to clean. Leave empty to clean ALL regions."
        required: false
        default: ""
        type: string
    secrets:
      AWS_ACCESS_KEY_ID:
        required: true
      AWS_SECRET_ACCESS_KEY:
        required: true

jobs:
  delete-volumes:
    runs-on: ubuntu-24.04

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Configure AWS credentials
        # Use your repo/org secrets: AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY
        # v4 of the action (v2 runs on a deprecated Node runtime).
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          # Always needs *some* region; if input empty we'll still iterate all inside the script
          aws-region: ${{ inputs.region || 'eu-central-1' }}

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.x'

      - name: Install requirements
        run: |
          python -m pip install --upgrade pip
          pip install boto3 tenacity

      - name: Run delete volumes script
        # Pass the input through an env var instead of interpolating
        # ${{ inputs.region }} directly into the shell script — interpolated
        # input text would be executed verbatim (script injection).
        env:
          REGION: ${{ inputs.region }}
        run: |
          if [ -n "$REGION" ]; then
            python scripts/delete_volumes.py "$REGION"
          else
            python scripts/delete_volumes.py
          fi

197 changes: 197 additions & 0 deletions .github/workflows/deploy-to-eks.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
# Weekly (and on-demand) end-to-end test: create an EKS cluster, deploy the
# kubeflow-mlflow Terraform solution with Juju, run the UATs, then tear
# everything down and sweep leftover EBS volumes.
name: Create EKS cluster, deploy kubeflow-mlflow Terraform solution and run UATs

on:
  workflow_dispatch:
    inputs:
      k8s_version:
        description: 'Kubernetes version to use for the EKS cluster (e.g. 1.27)'
        required: false
      uats_branch:
        description: 'Branch to run the UATs from, e.g., main or track/1.10'
        required: false
  schedule:
    - cron: "17 02 * * 1"

env:
  CLUSTER_NAME: kubeflow-eks-test

jobs:
  deploy-solution-to-eks:
    name: Deploy CKF + MLFlow solution to EKS
    runs-on: ubuntu-24.04
    outputs:
      # Consumed by the delete-unattached-volumes job below.
      aws_region: ${{ steps.extract_region.outputs.region }}

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set envvars from dependencies.yaml
        # Flatten the key/value pairs in .github/dependencies.yaml
        # (K8S_VERSION, JUJU_VERSION, UATS_BRANCH, ...) into $GITHUB_ENV.
        run: |
          yq eval 'to_entries | .[] | "\(.key)=\(.value)"' ".github/dependencies.yaml" | while IFS= read -r line; do
            echo "$line" >> "$GITHUB_ENV"
          done

      - name: Update ENV variables from inputs if available
        # workflow_dispatch inputs override the defaults loaded above; on
        # scheduled runs the inputs are empty so the defaults win. Inputs are
        # passed through the step env instead of being interpolated into the
        # script body, which avoids shell injection via crafted input values.
        env:
          INPUT_K8S_VERSION: ${{ inputs.k8s_version }}
          INPUT_UATS_BRANCH: ${{ inputs.uats_branch }}
        run: |
          echo "K8S_VERSION=${INPUT_K8S_VERSION:-${K8S_VERSION}}" >> "$GITHUB_ENV"
          echo "UATS_BRANCH=${INPUT_UATS_BRANCH:-${UATS_BRANCH}}" >> "$GITHUB_ENV"

      - name: Extract AWS region from cluster.yaml
        id: extract_region
        run: |
          REGION=$(yq e '.metadata.region' .github/cluster.yaml)
          echo "AWS_REGION=$REGION" >> $GITHUB_ENV
          echo "region=$REGION" >> $GITHUB_OUTPUT

      - name: Install CLI tools & dependencies
        run: |
          pip install tox
          sudo snap install juju --channel=${{ env.JUJU_VERSION }}/stable
          sudo snap install charmcraft --channel latest/stable --classic
          sudo snap install terraform --channel=latest/stable --classic
          juju version
          terraform --version
          charmcraft version

      - name: Configure AWS credentials
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        run: |
          mkdir -p ~/.aws
          # Read the secrets from the step env above instead of interpolating
          # ${{ secrets.* }} into the script body (keeps them out of the
          # composed command text).
          aws configure set aws_access_key_id "$AWS_ACCESS_KEY_ID"
          aws configure set aws_secret_access_key "$AWS_SECRET_ACCESS_KEY"
          aws configure set default.region "${{ env.AWS_REGION }}"
          echo "AWS_SDK_LOAD_CONFIG=1" >> "$GITHUB_ENV"

      - name: Install kubectl
        run: |
          sudo snap install kubectl --classic --channel=${{ env.K8S_VERSION }}/stable
          mkdir ~/.kube
          kubectl version --client

      - name: Install eksctl
        run: |
          PLATFORM=$(uname -s)_amd64
          curl -sL "https://github.com/eksctl-io/eksctl/releases/latest/download/eksctl_${PLATFORM}.tar.gz" | tar xz -C /tmp
          sudo mv /tmp/eksctl /usr/local/bin
          eksctl version

      # Once working, do we want to keep these two pre-deletion steps?
      - name: Pre-delete EKS cluster (if exists)
        run: |
          echo "Attempting to delete EKS cluster '${{ env.CLUSTER_NAME }}' (if it exists)..."
          eksctl delete cluster --region ${{ env.AWS_REGION }} --name ${{ env.CLUSTER_NAME }} || echo "Cluster not found or already deleted."

          echo "Confirming deletion..."
          aws eks describe-cluster --region ${{ env.AWS_REGION }} --name ${{ env.CLUSTER_NAME }} || echo "Cluster no longer exists."

      - name: Pre-delete CloudFormation stack (if exists)
        run: |
          STACK_NAME="eksctl-${{ env.CLUSTER_NAME }}-cluster"
          echo "Deleting CloudFormation stack '$STACK_NAME' (if it exists)..."
          aws cloudformation delete-stack --region ${{ env.AWS_REGION }} --stack-name "$STACK_NAME" || echo "Stack not found."

          echo "Waiting (max 10 minutes) for stack deletion to complete..."
          timeout 600s aws cloudformation wait stack-delete-complete --region ${{ env.AWS_REGION }} --stack-name "$STACK_NAME" \
            && echo "Stack deleted." \
            || echo "Stack deletion timed out or failed (continuing)."

          echo "Verifying stack is gone..."
          aws cloudformation describe-stacks --region ${{ env.AWS_REGION }} --stack-name "$STACK_NAME" 2>/dev/null \
            || echo "Stack no longer exists."

      - name: Create EKS cluster
        run: |
          # Inject this workflow's cluster name and K8s version into the config.
          yq e ".metadata.name |= \"${{ env.CLUSTER_NAME }}\"" -i .github/cluster.yaml
          yq e ".metadata.version |= \"${{ env.K8S_VERSION }}\"" -i .github/cluster.yaml

          # The node group has ssh.allow: true, so eksctl needs a public key.
          ssh-keygen -q -t rsa -N '' -f ~/.ssh/id_rsa <<<y >/dev/null 2>&1
          eksctl create cluster -f .github/cluster.yaml
          kubectl get nodes

      - name: Configure EKS nodes
        run: |
          echo "Configuring sysctl on EKS workers"
          source ./scripts/gh-actions/set_eks_sysctl_config.sh

      - name: Setup Juju controller
        run: |
          /snap/juju/current/bin/juju add-k8s eks --client
          juju bootstrap eks eks-controller

      - name: Deploy and assert kubeflow-mlflow solution
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          AWS_REGION: ${{ env.AWS_REGION }}
        run: |
          tox -c ./modules/kubeflow-mlflow -vve test_deployment -- -vv -s

      - name: Run UATs
        run: |
          git clone https://github.com/canonical/charmed-kubeflow-uats.git ~/charmed-kubeflow-uats
          cd ~/charmed-kubeflow-uats
          # UATS_BRANCH is a real env var (set via $GITHUB_ENV above); read it
          # with the shell instead of expression interpolation.
          git checkout "$UATS_BRANCH"
          tox -e uats-remote -- --filter "not feast"

      # On failure, capture debugging resources
      - name: Select model (for debug)
        if: failure() || cancelled()
        run: juju switch eks-controller:kubeflow

      - name: Save debug artifacts
        if: failure() || cancelled()
        uses: canonical/kubeflow-ci/actions/dump-charm-debug-artifacts@main

      - name: Get juju status
        if: failure() || cancelled()
        run: juju status

      - name: Get juju debug logs
        if: failure() || cancelled()
        run: juju debug-log --replay --no-tail

      - name: Get all Kubernetes resources
        if: failure() || cancelled()
        run: kubectl get all -A

      - name: Describe all pods
        if: failure() || cancelled()
        run: kubectl describe pods --all-namespaces

      - name: Logs from Pending pods
        if: failure() || cancelled()
        run: |
          kubectl -n kubeflow get pods | tail -n +2 | grep Pending | awk '{print $1}' | xargs -r -n1 kubectl -n kubeflow logs --all-containers=true --tail 100

      - name: Logs from Failed pods
        if: failure() || cancelled()
        run: |
          kubectl -n kubeflow get pods | tail -n +2 | grep Failed | awk '{print $1}' | xargs -r -n1 kubectl -n kubeflow logs --all-containers=true --tail 100

      - name: Logs from CrashLoopBackOff pods
        if: failure() || cancelled()
        run: |
          kubectl -n kubeflow get pods | tail -n +2 | grep CrashLoopBackOff | awk '{print $1}' | xargs -r -n1 kubectl -n kubeflow logs --all-containers=true --tail 100

      # Clean up resources
      - name: Delete EKS cluster
        if: always()
        run: eksctl delete cluster --region ${{ env.AWS_REGION }} --name ${{ env.CLUSTER_NAME }}

      - name: Delete CloudFormation stack
        if: always()
        run: aws cloudformation delete-stack --region ${{ env.AWS_REGION }} --stack-name eksctl-${{ env.CLUSTER_NAME }}-cluster

  delete-unattached-volumes:
    name: Clean unattached EBS volumes
    if: always()
    needs: [deploy-solution-to-eks]
    uses: ./.github/workflows/delete-aws-volumes.yaml
    with:
      region: ${{ needs.deploy-solution-to-eks.outputs.aws_region }}
    secrets: inherit
1 change: 1 addition & 0 deletions modules/kubeflow-mlflow/tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -33,5 +33,6 @@ deps =
tenacity
ops>=2.3.0
juju<4.0.0
pytest
pytest-dependency
description = Test bundle deployment
49 changes: 49 additions & 0 deletions scripts/delete_volumes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Delete unattached EBS volumes (state=available) in all AWS regions
# source: https://towardsthecloud.com/amazon-ec2-delete-unattached-ebs-volumes
import boto3
from tenacity import retry, stop_after_attempt, wait_fixed
import sys

@retry(stop=stop_after_attempt(3), wait=wait_fixed(2), reraise=True)
def delete_volumes_in_region(region_name: str, count: int) -> int:
    """Delete every unattached (state == "available") EBS volume in one region.

    Args:
        region_name: AWS region to scan, e.g. "eu-central-1".
        count: Running total of volumes deleted so far.

    Returns:
        The updated running total (``count`` plus volumes deleted here).

    Raises:
        Exception: AWS/boto3 errors are re-raised after the ``tenacity``
            decorator has retried the whole function 3 times, 2 s apart.
    """
    try:
        ec2conn = boto3.resource("ec2", region_name=region_name)
        # Snapshot the candidates first, then delete, so we never mutate the
        # collection we are iterating over.
        unattached_volumes = [
            volume for volume in ec2conn.volumes.all() if (volume.state == "available")
        ]
        for volume in unattached_volumes:
            volume.delete()
            print(f"Deleted unattached volume {volume.id} in region {region_name}.")
            count = count + 1
        return count
    except Exception as e:
        # Log each failed attempt so retries are visible, then re-raise with
        # a bare `raise` to preserve the original traceback.
        print(f"Error: {e}")
        raise

def validate_region(region_name: str) -> bool:
    """Return True if ``region_name`` is an EC2 region known to this account."""
    ec2_client = boto3.client("ec2")
    account_regions = ec2_client.describe_regions()["Regions"]
    return any(entry["RegionName"] == region_name for entry in account_regions)

def delete_volumes() -> None:
    """Entry point: delete unattached EBS volumes.

    With a CLI argument (``sys.argv[1]``) only that region is cleaned, after
    validating it against the account's region list; with no argument every
    region visible to the account is cleaned.

    Raises:
        ValueError: If the requested region is not valid for this account.
            (A subclass of Exception, so existing broad handlers still work.)
    """
    count = 0
    if len(sys.argv) > 1:
        region_name = sys.argv[1]
        if validate_region(region_name):
            count = delete_volumes_in_region(region_name, count)
        else:
            print("Region from input isn't being used in this AWS account.")
            raise ValueError(f"Invalid region: {region_name}")
    else:
        # No region given: sweep every region in the account.
        ec2 = boto3.client("ec2")
        for region in ec2.describe_regions()["Regions"]:
            region_name = region["RegionName"]
            count = delete_volumes_in_region(region_name, count)

    if count > 0:
        print(f"Deleted {count} unattached volumes.")
    else:
        print("No unattached volumes found for deletion.")


# Guarded so importing this module (e.g. from tests) has no side effects;
# running the script directly behaves exactly as before.
if __name__ == "__main__":
    delete_volumes()
Loading