Skip to content

Commit 34ec068

Browse files
author
Mohamed Zeidan
committed
extra error handling for cluster connection + k8s connectivity
1 parent f940caf commit 34ec068

File tree

2 files changed

+749
-0
lines changed

2 files changed

+749
-0
lines changed

src/sagemaker/hyperpod/common/cli_decorators.py

Lines changed: 254 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,11 @@
77
import click
88
import functools
99
import logging
10+
import subprocess
11+
import os
1012
from kubernetes.client.exceptions import ApiException
13+
from kubernetes import config
14+
from kubernetes.config.config_exception import ConfigException
1115

1216
logger = logging.getLogger(__name__)
1317

@@ -761,6 +765,210 @@ def _check_resources_exist(raw_resource_type: str, namespace: str) -> bool:
761765
logger.debug(f"Failed to check resource existence for {raw_resource_type}: {e}")
762766
return None
763767

768+
769+
def _check_kubernetes_connectivity() -> tuple[bool, str]:
    """
    Probe the Kubernetes cluster selected by the local kubeconfig.

    Loads the kubeconfig and requests the API server version as a cheap
    round-trip. Failures are never raised; they are classified into a short
    error code instead.

    Returns:
        ``(True, "")`` when the version call succeeds. Otherwise
        ``(False, code)`` where ``code`` is one of ``"no_config"``,
        ``"invalid_config"``, ``"unauthorized"``, ``"forbidden"``,
        ``"connection_error"``, or a prefixed free-form string
        (``"config_error: ..."``, ``"api_error: ..."``, ``"unknown_error: ..."``).
    """
    try:
        config.load_kube_config()
        from kubernetes import client

        # Version lookup is the connectivity probe; its result is discarded.
        client.VersionApi().get_code()
    except ConfigException as config_err:
        detail = str(config_err)
        # NOTE(review): keep "No configuration found" first — a single client
        # message may contain both phrases; confirm against the kubernetes client.
        if "No configuration found" in detail:
            return False, "no_config"
        if "Invalid kube-config" in detail:
            return False, "invalid_config"
        return False, f"config_error: {detail}"
    except ApiException as api_err:
        if api_err.status == 401:
            return False, "unauthorized"
        if api_err.status == 403:
            return False, "forbidden"
        return False, f"api_error: {api_err.status} {api_err.reason}"
    except Exception as err:
        # Anything else is classified by keywords in its lower-cased message;
        # the first matching row wins.
        raw_text = str(err)
        lowered = raw_text.lower()
        keyword_codes = (
            (("unauthorized", "401"), "unauthorized"),
            (("forbidden", "403"), "forbidden"),
            (("connection", "timeout"), "connection_error"),
        )
        for keywords, code in keyword_codes:
            if any(word in lowered for word in keywords):
                return False, code
        return False, f"unknown_error: {raw_text}"

    return True, ""
808+
809+
def _check_aws_credentials() -> bool:
810+
"""
811+
Check if AWS credentials are available and valid.
812+
"""
813+
try:
814+
import boto3
815+
from botocore.exceptions import NoCredentialsError, PartialCredentialsError
816+
817+
# Try to get caller identity
818+
sts = boto3.client('sts')
819+
sts.get_caller_identity()
820+
return True
821+
except (NoCredentialsError, PartialCredentialsError):
822+
return False
823+
except Exception as e:
824+
logger.debug(f"AWS credentials check failed: {e}")
825+
return False
826+
827+
def _get_current_kubernetes_context() -> str:
    """
    Look up the name of the active kubeconfig context.

    Returns:
        The active context's ``name`` entry (``'unknown'`` if that key is
        absent), ``'none'`` when there is no active context, or ``'unknown'``
        when the lookup raises (the error is logged at debug level).
    """
    try:
        # Only the active context is needed; the full list is ignored.
        _all_contexts, current = config.list_kube_config_contexts()
        return current.get('name', 'unknown') if current else 'none'
    except Exception as lookup_error:
        logger.debug(f"Failed to get current context: {lookup_error}")
        return 'unknown'
839+
840+
def _generate_kubernetes_auth_error_message(error_type: str) -> str:
841+
"""
842+
Generate helpful error message for Kubernetes authentication issues.
843+
"""
844+
if error_type == "no_config":
845+
return (
846+
"❌ Kubernetes configuration not found.\n"
847+
"No kubeconfig file found. Please ensure you have:\n"
848+
"1. A valid kubeconfig file at ~/.kube/config, or\n"
849+
"2. Set the KUBECONFIG environment variable\n\n"
850+
"To configure cluster access:\n"
851+
" hyp set-cluster-context <cluster-name> --region <region>\n\n"
852+
"💡 This will set up the necessary Kubernetes configuration for your HyperPod cluster."
853+
)
854+
855+
elif error_type == "invalid_config":
856+
return (
857+
"❌ Invalid Kubernetes configuration.\n"
858+
"Your kubeconfig file appears to be corrupted or invalid.\n\n"
859+
"To fix this:\n"
860+
"1. Check your kubeconfig file at ~/.kube/config\n"
861+
"2. Reconfigure cluster access:\n"
862+
" hyp set-cluster-context <cluster-name> --region <region>\n\n"
863+
"💡 This will refresh your cluster configuration with the correct settings."
864+
)
865+
866+
elif error_type == "unauthorized":
867+
current_context = _get_current_kubernetes_context()
868+
aws_creds_valid = _check_aws_credentials()
869+
870+
message = (
871+
"❌ Kubernetes authentication failed (401 Unauthorized).\n"
872+
f"Current context: {current_context}\n\n"
873+
"This usually means your credentials have expired or are invalid.\n\n"
874+
)
875+
876+
if not aws_creds_valid:
877+
message += (
878+
"🔍 AWS credentials issue detected:\n"
879+
"Your AWS credentials appear to be missing or invalid.\n\n"
880+
"To fix this:\n"
881+
"1. Check your AWS credentials:\n"
882+
" aws sts get-caller-identity\n"
883+
"2. If expired, refresh your AWS credentials\n"
884+
"3. Then reconfigure cluster access:\n"
885+
" hyp set-cluster-context <cluster-name> --region <region>\n\n"
886+
"💡 Make sure your AWS credentials have the necessary EKS permissions."
887+
)
888+
else:
889+
message += (
890+
"To fix this:\n"
891+
"1. Reconfigure cluster access:\n"
892+
" hyp set-cluster-context <cluster-name> --region <region>\n"
893+
"2. Try your HyperPod command again\n\n"
894+
"💡 This will refresh your authentication with the cluster."
895+
)
896+
897+
return message
898+
899+
elif error_type == "forbidden":
900+
return (
901+
"❌ Kubernetes access denied (403 Forbidden).\n"
902+
"You don't have permission to access this cluster or resource.\n\n"
903+
"This could mean:\n"
904+
"1. Your user/role lacks the necessary RBAC permissions\n"
905+
"2. You're connected to the wrong cluster\n"
906+
"3. The cluster's access policies have changed\n\n"
907+
"To fix this:\n"
908+
"1. Verify you're using the correct cluster context\n"
909+
"2. Contact your cluster administrator for access\n"
910+
"3. Ensure your AWS role has the necessary EKS permissions"
911+
)
912+
913+
elif error_type == "connection_error":
914+
return (
915+
"❌ Cannot connect to Kubernetes cluster.\n"
916+
"Network connection to the cluster failed.\n\n"
917+
"This could mean:\n"
918+
"1. The cluster is not accessible from your network\n"
919+
"2. The cluster endpoint URL is incorrect\n"
920+
"3. Network connectivity issues\n\n"
921+
"To fix this:\n"
922+
"1. Check your network connection\n"
923+
"2. Verify the cluster is running and accessible\n"
924+
"3. Reconfigure cluster access:\n"
925+
" hyp set-cluster-context <cluster-name> --region <region>"
926+
)
927+
928+
else:
929+
return (
930+
"❌ Kubernetes connection failed.\n"
931+
f"Error: {error_type}\n\n"
932+
"To troubleshoot:\n"
933+
"1. Check your kubeconfig file at ~/.kube/config\n"
934+
"2. Reconfigure cluster access:\n"
935+
" hyp set-cluster-context <cluster-name> --region <region>\n"
936+
"3. Try your HyperPod command again"
937+
)
938+
939+
def _is_kubernetes_operation(func, **kwargs) -> bool:
940+
"""
941+
Detect if this operation requires Kubernetes connectivity.
942+
"""
943+
try:
944+
# Check function name for Kubernetes-related patterns
945+
func_name = func.__name__.lower()
946+
k8s_patterns = ['logs', 'operator', 'pod', 'describe', 'list', 'delete', 'create']
947+
948+
if any(pattern in func_name for pattern in k8s_patterns):
949+
return True
950+
951+
# Check if wrapped function has Kubernetes patterns
952+
if hasattr(func, '__wrapped__'):
953+
wrapped_name = getattr(func.__wrapped__, '__name__', '').lower()
954+
if any(pattern in wrapped_name for pattern in k8s_patterns):
955+
return True
956+
957+
# Check Click command info for Kubernetes patterns
958+
try:
959+
click_ctx = click.get_current_context(silent=True)
960+
if click_ctx and hasattr(click_ctx, 'info_name'):
961+
command_path = str(click_ctx.info_name).lower()
962+
if any(pattern in command_path for pattern in k8s_patterns):
963+
return True
964+
except Exception:
965+
pass
966+
967+
except Exception as e:
968+
logger.debug(f"Failed to detect Kubernetes operation: {e}")
969+
970+
return False
971+
764972
def handle_cli_exceptions():
765973
"""
766974
Template-agnostic decorator with proactive namespace validation and enhanced error handling.
@@ -815,6 +1023,15 @@ def wrapper(*args, **kwargs):
8151023
sys.exit(1)
8161024
return
8171025

1026+
# Kubernetes connectivity check for operations that require it
1027+
if _is_kubernetes_operation(func, **kwargs):
1028+
is_connected, error_type = _check_kubernetes_connectivity()
1029+
if not is_connected:
1030+
auth_error_message = _generate_kubernetes_auth_error_message(error_type)
1031+
click.echo(auth_error_message)
1032+
sys.exit(1)
1033+
return
1034+
8181035
# Execute the command
8191036
try:
8201037
return func(*args, **kwargs)
@@ -828,6 +1045,43 @@ def wrapper(*args, **kwargs):
8281045
sys.exit(1)
8291046
return
8301047

1048+
# 2.1: Enhanced Kubernetes Authentication Error Handling
1049+
# Check for 401 Unauthorized errors and provide helpful guidance
1050+
if isinstance(e, ApiException) and e.status == 401:
1051+
auth_error_message = _generate_kubernetes_auth_error_message("unauthorized")
1052+
click.echo(auth_error_message)
1053+
sys.exit(1)
1054+
return
1055+
elif "401" in str(e) and ("unauthorized" in str(e).lower() or "Unauthorized" in str(e)):
1056+
auth_error_message = _generate_kubernetes_auth_error_message("unauthorized")
1057+
click.echo(auth_error_message)
1058+
sys.exit(1)
1059+
return
1060+
1061+
# 2.2: Enhanced Kubernetes Forbidden Error Handling
1062+
elif isinstance(e, ApiException) and e.status == 403:
1063+
auth_error_message = _generate_kubernetes_auth_error_message("forbidden")
1064+
click.echo(auth_error_message)
1065+
sys.exit(1)
1066+
return
1067+
elif "403" in str(e) and ("forbidden" in str(e).lower() or "Forbidden" in str(e)):
1068+
auth_error_message = _generate_kubernetes_auth_error_message("forbidden")
1069+
click.echo(auth_error_message)
1070+
sys.exit(1)
1071+
return
1072+
1073+
# 2.3: Enhanced Kubernetes Configuration Error Handling
1074+
elif isinstance(e, ConfigException):
1075+
if "No configuration found" in str(e):
1076+
auth_error_message = _generate_kubernetes_auth_error_message("no_config")
1077+
elif "Invalid kube-config" in str(e):
1078+
auth_error_message = _generate_kubernetes_auth_error_message("invalid_config")
1079+
else:
1080+
auth_error_message = _generate_kubernetes_auth_error_message(f"config_error: {str(e)}")
1081+
click.echo(auth_error_message)
1082+
sys.exit(1)
1083+
return
1084+
8311085
# 3: Enhanced 404 Resource Handling with Dynamic Target Detection
8321086
# Check if this is a 404 error that can benefit from enhanced handling
8331087
if isinstance(e, ApiException) and e.status == 404:

0 commit comments

Comments
 (0)