Skip to content

Commit 48012d8

Browse files
authored
Merge pull request #530 from AI-Hypercomputer/lidanny/feature/update-to-CoreDNS
feat: Added an update to CoreDNS
2 parents f2340d0 + 98bcf4e commit 48012d8

File tree

1 file changed

+259
-0
lines changed

1 file changed

+259
-0
lines changed

src/xpk/commands/cluster.py

Lines changed: 259 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,8 @@
7878
from ..utils.file import write_tmp_file
7979
from . import cluster_gcluster
8080
from .common import set_cluster_command
81+
import shutil
82+
import os
8183

8284

8385
def cluster_adapt(args) -> None:
@@ -247,6 +249,10 @@ def cluster_create(args) -> None:
247249

248250
get_cluster_credentials(args)
249251

252+
# Make sure CoreDNS is present before the Kubernetes environment is set up.
update_coredns_command_code = update_coredns_if_necessary(args)
if update_coredns_command_code != 0:
  # Exit with the CoreDNS step's own code; the previous code passed
  # `update_cluster_command_code`, a variable belonging to a different step.
  xpk_exit(update_coredns_command_code)
255+
250256
k8s_client = setup_k8s_env(args)
251257

252258
install_storage_crd(k8s_client)
@@ -702,6 +708,259 @@ def cluster_create_ray_cluster(args) -> None:
702708
cluster_create(args)
703709

704710

711+
def install_jq(args):
  """Installs the 'jq' utility unless it is already available on PATH.

  When the install command reports a non-zero return code, the code is printed
  and handed to xpk_exit.

  Args:
    args: user provided arguments for running the command.
  """
  # Avoid the privileged apt call when jq is already present; this also lets
  # hosts without apt continue as long as jq was installed some other way.
  if shutil.which('jq') is not None:
    xpk_print("Task: 'Install jq' skipped, jq is already installed.")
    return

  command_jq_install = 'sudo apt install jq -y'
  xpk_print("Task: 'Install jq' in progress.")
  return_code = run_command_with_updates(command_jq_install, 'Install jq', args)
  if return_code != 0:
    xpk_print(f'Install jq error {return_code}')
    xpk_exit(return_code)
719+
720+
721+
def clone_coredns_deployment_repo(args, coredns_repo_full_path: str):
  """Fetches the coredns/deployment repository into the given directory.

  No clone is attempted when the target path is already present on disk. A
  non-zero return code from the clone command is printed and passed to
  xpk_exit.

  Args:
    args: user provided arguments for running the command.
    coredns_repo_full_path: destination path of the checkout.
  """
  if not os.path.exists(coredns_repo_full_path):
    git_clone_cmd = (
        'git clone https://github.com/coredns/deployment.git'
        f' {coredns_repo_full_path}'
    )
    xpk_print(
        "Task: 'Clone deployment' in progress, Target"
        f' directory:{coredns_repo_full_path}.'
    )
    exit_status = run_command_with_updates(
        git_clone_cmd, 'Clone deployment', args
    )
    if exit_status != 0:
      xpk_print(f'Clone deployment error {exit_status}')
      xpk_exit(exit_status)
    return

  # The path is already there: reuse it as-is.
  xpk_print(
      f"Directory '{coredns_repo_full_path}' already exists, skip git clone."
  )
742+
743+
744+
def deploy_coredns_manifests(args, coredns_k8s_path: str):
  """Deploys CoreDNS manifests to the cluster.

  Runs `./deploy.sh | kubectl apply -f -` from inside `coredns_k8s_path` and
  restores the previous working directory afterwards, whether or not the
  command succeeded. A non-zero return code is printed and passed to xpk_exit
  once the working directory has been restored.

  Args:
    args: user provided arguments for running the command.
    coredns_k8s_path: directory from which `./deploy.sh` is executed.
  """
  if not os.path.isdir(coredns_k8s_path):
    xpk_print(
        f"Error:CoreDNS Kubernetes path '{coredns_k8s_path}' does not exist."
        ' Has git clone been successful?'
    )
    xpk_exit(1)

  original_cwd = os.getcwd()
  try:
    # deploy.sh is invoked by relative path, so it must run from its directory.
    os.chdir(coredns_k8s_path)
    xpk_print(f'Current working directory changed to: {os.getcwd()}')

    command_deploy_coredns = './deploy.sh | kubectl apply -f -'
    xpk_print(
        f"Task: 'Deploy CoreDNS' in progress, Located at '{coredns_k8s_path}'"
    )
    return_code = run_command_with_updates(
        command_deploy_coredns, 'Deploy CoreDNS', args
    )
    if return_code != 0:
      xpk_print(f'Deploy CoreDNS error {return_code}')
  finally:
    xpk_print(f'Restoring working directory to: {original_cwd}')
    os.chdir(original_cwd)

  # Decide on the exit outside `finally`: if the try body raised before
  # `return_code` was assigned, the original exception now propagates instead
  # of being replaced by an UnboundLocalError.
  if return_code != 0:
    xpk_exit(return_code)
772+
773+
774+
def scale_down_deployment(
    args, deployment_name: str, namespace: str = 'kube-system'
):
  """Sets the replica count of a Kubernetes deployment to zero via kubectl.

  A non-zero return code from the scale command is printed and passed to
  xpk_exit.

  Args:
    args: user provided arguments for running the command.
    deployment_name: name of the deployment to scale down.
    namespace: namespace that holds the deployment.
  """
  task_label = f'Scale down {deployment_name}'
  xpk_print(f"Task: 'Scaling down {deployment_name}' in progress")
  exit_status = run_command_with_updates(
      f'kubectl scale deployment {deployment_name} --replicas=0'
      f' --namespace={namespace}',
      task_label,
      args,
  )
  if exit_status != 0:
    xpk_print(f'Scale down {deployment_name} error {exit_status}')
    xpk_exit(exit_status)
  xpk_print(f'\n{deployment_name} has been scaled down.')
790+
791+
792+
def scale_up_coredns(args, replicas: int = 15, namespace: str = 'kube-system'):
  """Sets the replica count of the 'coredns' deployment via kubectl.

  A non-zero return code from the scale command is printed and passed to
  xpk_exit.

  Args:
    args: user provided arguments for running the command.
    replicas: desired number of CoreDNS replicas.
    namespace: namespace that holds the 'coredns' deployment.
  """
  scale_cmd = (
      f'kubectl scale deployment coredns --replicas={replicas} -n {namespace}'
  )
  xpk_print(f"Task: 'Scale CoreDNS' in progress (to {replicas} replicas)")
  exit_status = run_command_with_updates(scale_cmd, 'Scale CoreDNS', args)
  if exit_status == 0:
    return
  xpk_print(f'Scale CoreDNS error {exit_status}')
  xpk_exit(exit_status)
804+
805+
806+
def check_deployment_exists(args, deployment_name: str, namespace: str) -> bool:
  """Runs `kubectl get deployment` for a Deployment in the given namespace.

  Args:
    args: user provided arguments for running the command.
    deployment_name: name of the Deployment to look up.
    namespace: namespace to look in.

  Returns:
    The value produced by run_command_with_updates for the kubectl command,
    returned unchanged.
  """
  # NOTE(review): other callers in this file compare the result of
  # run_command_with_updates against 0 as a return code, so a successful
  # command would be falsy here, and `--ignore-not-found` presumably makes
  # kubectl succeed even when the Deployment is absent. Confirm the value
  # really expresses "exists" before relying on it.
  return run_command_with_updates(
      f'kubectl get deployment {deployment_name} -n'
      f' {namespace} --ignore-not-found',
      'Waiting for kubeDNS to be checked.',
      args,
  )
816+
817+
818+
def verify_coredns_readiness(
    args, timeout: int = 120, namespace: str = 'kube-system'
):
  """Waits, via `kubectl wait`, for kube-dns to drain and CoreDNS to come up.

  When check_deployment_exists reports a truthy result for 'kube-dns', the
  function first waits for that deployment's replica count to reach zero. It
  then waits for the 'coredns' deployment to report the Available condition.
  Either wait failing results in xpk_exit(1).

  Args:
    args: user provided arguments for running the command.
    timeout: number of seconds given to each `kubectl wait` call.
    namespace: namespace that holds both deployments.
  """
  xpk_print('Now verifying CoreDNS readiness...')

  if not check_deployment_exists(args, 'kube-dns', namespace):
    xpk_print('kube-dns deployment not found.')
  else:
    # Block until kube-dns reports zero replicas.
    wait_scaled_down_cmd = (
        'kubectl wait deployment/kube-dns'
        " --for=jsonpath='{.status.replicas}'=0"
        f' --namespace={namespace} --timeout={timeout}s'
    )
    xpk_print('Verifying if kube-dns has scaled down...')
    kube_dns_status = run_command_with_updates(
        wait_scaled_down_cmd, 'Wait for kube-dns scale down', args
    )
    if kube_dns_status == 0:
      xpk_print('kube-dns has successfully scaled down.')
    else:
      xpk_print('kube-dns did not scale down successfully within the timeout.')
      xpk_exit(1)  # kube-dns could not be scaled down.

  # Block until the coredns deployment is Available.
  wait_available_cmd = (
      'kubectl wait deployment/coredns --for=condition=Available=true'
      f' --namespace={namespace} --timeout={timeout}s'
  )
  xpk_print('Verifying if CoreDNS is available...')
  coredns_status = run_command_with_updates(
      wait_available_cmd, 'Wait for coredns available', args
  )
  if coredns_status != 0:
    xpk_print(
        'CoreDNS verification failed, it might not have fully started within'
        ' the timeout.'
    )
    xpk_exit(1)  # CoreDNS never became available.

  xpk_print('CoreDNS has successfully started and passed verification.')
859+
860+
861+
def cleanup_coredns_repo(coredns_repo_full_path: str):
  """Removes the cloned CoreDNS deployment directory tree.

  Removal is best-effort: an OSError is reported through xpk_print and not
  re-raised.

  Args:
    coredns_repo_full_path: path of the directory tree to delete.
  """
  progress_message = (
      "Task: 'Deleting CoreDNS deployment directory' in progress:"
      f' {coredns_repo_full_path}'
  )
  xpk_print(progress_message)
  try:
    shutil.rmtree(coredns_repo_full_path)
    xpk_print(f'Successfully deleted directory: {coredns_repo_full_path}')
  except OSError as err:
    xpk_print(f'Error deleting directory {coredns_repo_full_path}: {err}')
872+
873+
874+
def update_coredns(args):
  """Updates and deploys CoreDNS within a cluster.

  The CoreDNS deployment repository is cloned into a freshly created private
  temporary directory, deployed from there, and the directory is removed again
  whether or not the setup steps complete.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 once every step has completed. The individual steps call xpk_exit
    themselves when they fail.
  """
  # Local import: tempfile is only needed by this function.
  import tempfile

  # A unique, private directory instead of a fixed `/tmp/deployment`: a
  # pre-existing directory at a predictable shared path would be reused
  # without cloning, its deploy.sh executed, and then deleted.
  coredns_repo_dir = tempfile.mkdtemp(prefix='xpk-coredns-')
  coredns_repo_dir_name = 'deployment'
  coredns_repo_full_path = os.path.join(coredns_repo_dir, coredns_repo_dir_name)
  coredns_k8s_path = os.path.join(coredns_repo_full_path, 'kubernetes')
  try:
    # 1. Install jq
    install_jq(args)

    # 2. Clone CoreDNS deployment repository
    clone_coredns_deployment_repo(args, coredns_repo_full_path)

    # 3. Deploy CoreDNS to the cluster
    deploy_coredns_manifests(args, coredns_k8s_path)

    # 4. Scale down kube-dns-autoscaler
    scale_down_deployment(args, 'kube-dns-autoscaler')

    # 5. Scale down kube-dns
    scale_down_deployment(args, 'kube-dns')

    # 6. Scale up coredns and verify readiness
    scale_up_coredns(args, replicas=15)
    verify_coredns_readiness(args, timeout=120)

    xpk_print('The CoreDNS setup process has been completed.')
  finally:
    # 7. Cleanup. Runs even when a step above exits early, so the checkout is
    # never left behind.
    cleanup_coredns_repo(coredns_repo_dir)

  return 0
912+
913+
914+
def coredns_deployment_exists(args, namespace: str = 'kube-system') -> bool:
  """Reports whether `kubectl get deployment coredns` succeeds in a namespace.

  When the lookup succeeds, verify_coredns_readiness is also run before the
  result is returned.

  Args:
    args: user provided arguments for running the command.
    namespace: The Kubernetes namespace to check for the CoreDNS deployment.

  Returns:
    True if the kubectl lookup returned code 0, False otherwise.
  """
  xpk_print(
      "Task: 'Checking CoreDNS deployment existence' in progress for"
      f' namespace: {namespace}'
  )
  lookup_status = run_command_with_updates(
      f'kubectl get deployment coredns -n {namespace}',
      f'Check CoreDNS deployment in {namespace}',
      args,
  )
  if lookup_status != 0:
    xpk_print(
        f"CoreDNS deployment 'coredns' NOT found in namespace '{namespace}' or"
        ' an error occurred.'
    )
    return False

  verify_coredns_readiness(args)
  xpk_print(f"CoreDNS deployment 'coredns' found in namespace '{namespace}'.")
  return True
941+
942+
943+
def update_coredns_if_necessary(args) -> int:
  """Deploys CoreDNS to the cluster unless a deployment is already there.

  The 'kube-system' namespace is checked with coredns_deployment_exists; only
  when that check is negative does the function run update_coredns.

  Args:
    args: User-provided arguments for running the command.

  Returns:
    0 when CoreDNS was already present, otherwise the value returned by
    update_coredns.
  """
  if not coredns_deployment_exists(args, namespace='kube-system'):
    xpk_print('CoreDNS deployment not found. Proceeding with CoreDNS setup.')
    return update_coredns(args)

  xpk_print('Skipping CoreDNS deployment since it already exists.')
  return 0
962+
963+
705964
def create_cluster_if_necessary(
706965
args, gke_control_plane_version: str, system: SystemCharacteristics
707966
) -> int:

0 commit comments

Comments
 (0)