Skip to content

Commit 0de2138

Browse files
authored
Add version comptability check between server K8s and Client python K8s (#138)
* Add k8s version validation check between server and client version according to the supported versioning constraints by k8s * Fix unit test cases * Move regex to a constant. **Description** - Removed an integration test case that was being mocked. - Moved a regex to a constant. **Testing Done** Unit test cases pass no changes made to integration test cases and they should not be affected. * Add k8s version validation check between server and client version according to the supported versioning constraints by k8s * Add ref link for version comptability contraints **Description** Added a link to k8s documentation mentioning the constraints that rule the version compatibility of client and server. **Testing Done** No breaking changes.
1 parent 36fac66 commit 0de2138

File tree

6 files changed

+302
-15
lines changed

6 files changed

+302
-15
lines changed

src/sagemaker/hyperpod/common/utils.py

Lines changed: 168 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,21 @@
1-
from kubernetes import client
1+
from kubernetes import client, __version__ as kubernetes_client_version
22
from pydantic import ValidationError
33
from kubernetes.client.exceptions import ApiException
44
from kubernetes import config
55
import re
66
import boto3
77
import json
8-
from typing import List
8+
from typing import List, Tuple, Optional
99
import logging
1010
import os
1111
import subprocess
1212
import yaml
13-
from typing import Optional
1413
from kubernetes.config import (
1514
KUBE_CONFIG_DEFAULT_LOCATION,
1615
)
1716

1817
EKS_ARN_PATTERN = r"arn:aws:eks:([\w-]+):\d+:cluster/([\w-]+)"
18+
CLIENT_VERSION_PATTERN = r'^\d+\.\d+\.\d+$'
1919

2020
KUBE_CONFIG_PATH = os.path.expanduser(KUBE_CONFIG_DEFAULT_LOCATION)
2121

@@ -297,3 +297,168 @@ def get_current_region():
297297
return get_region_from_eks_arn(eks_arn)
298298
except:
299299
return boto3.session.Session().region_name
300+
301+
302+
def parse_client_kubernetes_version(version_str: str) -> Tuple[int, int]:
303+
"""Parse major and minor version from client library version string.
304+
305+
Handles both old versioning scheme (v12 and before) and new homogenized scheme.
306+
Old scheme: v12.0.0 corresponds to Kubernetes v1.16
307+
New scheme: v17.0.0 corresponds to Kubernetes v1.17
308+
309+
Args:
310+
version_str (str): Client library version string (e.g., '12.0.0', '17.0.0', 'v12.0.0')
311+
312+
Returns:
313+
Tuple[int, int]: Major and minor version numbers as (1, minor)
314+
"""
315+
if not version_str:
316+
logger = logging.getLogger(__name__)
317+
logger.debug(f"Empty version string provided, Using default version 0.0")
318+
return 0, 0
319+
320+
# Remove suffix (like '+snapshot') if present
321+
version_str = version_str.split('+')[0]
322+
323+
# Remove 'v' prefix if present
324+
if version_str.startswith('v'):
325+
version_str = version_str[1:]
326+
327+
# Client library version format (x.y.z)
328+
if re.match(CLIENT_VERSION_PATTERN, version_str):
329+
major = int(version_str.split('.')[0])
330+
331+
# Old client versioning scheme (v12 and before)
332+
if major <= 12:
333+
# Currently maps to Kubernetes v1.x
334+
# This mapping assumes Kubernetes major version is 1
335+
# If Kubernetes moves to v2.x in the future, this mapping would need to be updated
336+
return 1, major + 4
337+
338+
# New homogenized scheme (v17 and above)
339+
# Currently maps to Kubernetes v1.x
340+
# This mapping assumes Kubernetes major version is 1
341+
# If Kubernetes moves to v2.x in the future, this mapping would need to be updated
342+
return 1, major
343+
344+
# If we get here, parsing failed
345+
logger = logging.getLogger(__name__)
346+
logger.warning(f"Failed to parse client version from string: '{version_str}'. Using default version 0.0.")
347+
return 0, 0
348+
349+
350+
351+
def is_kubernetes_version_compatible(client_version: Tuple[int, int], server_version: Tuple[int, int]) -> bool:
352+
"""
353+
Check if Kubernetes client and server versions are compatible.
354+
355+
Args:
356+
client_version (Tuple[int, int]): Client major and minor version
357+
server_version (Tuple[int, int]): Server major and minor version
358+
359+
Returns:
360+
bool: True if versions are compatible, False otherwise
361+
"""
362+
# Check for default versions (0.0) which indicate parsing failures
363+
if client_version == (0, 0) or server_version == (0, 0):
364+
logger = logging.getLogger(__name__)
365+
logger.warning(
366+
f"Version compatibility check using default version(s): client={client_version}, server={server_version}. "
367+
f"\nThis may indicate a version parsing issue. Please check your Kubernetes configuration."
368+
)
369+
return True
370+
371+
if client_version[0] != server_version[0]:
372+
return False
373+
374+
"""
375+
Client version should not be more than 3 minor versions behind the server and not more than
376+
1 minor version ahead of the server
377+
"""
378+
client_minor = client_version[1]
379+
server_minor = server_version[1]
380+
381+
if server_minor - client_minor > 3:
382+
return False
383+
384+
if client_minor - server_minor > 1:
385+
return False
386+
387+
return True
388+
389+
390+
def verify_kubernetes_version_compatibility(logger) -> bool:
391+
"""
392+
Verify compatibility between Kubernetes client and server versions.
393+
394+
This function checks if the current Kubernetes client version is compatible with
395+
the server version. It handles both minimum compatibility versions specified by
396+
the server and the standard Kubernetes support policy (within 3 minor versions behind
397+
and not more than 1 minor version ahead).
398+
399+
Ref link: https://github.com/kubernetes-client/python#compatibility
400+
401+
Args:
402+
logger: Logger instance for outputting messages.
403+
404+
Returns:
405+
bool: True if versions are compatible, False otherwise
406+
"""
407+
408+
try:
409+
version_api = client.VersionApi()
410+
server_version_info = version_api.get_code()
411+
412+
server_version_str = f"{server_version_info.major}.{server_version_info.minor}"
413+
client_version = parse_client_kubernetes_version(kubernetes_client_version)
414+
client_version_str = f"{client_version[0]}.{client_version[1]}"
415+
416+
# Debug output of server version info
417+
logger.debug(f"Server version info: {server_version_info}")
418+
logger.debug(f"Client version: {kubernetes_client_version}, parsed as {client_version_str}")
419+
420+
# Check if server provides minimum compatibility versions (these are optional strings)
421+
has_min_compatibility = False
422+
is_compatible = True
423+
424+
try:
425+
if hasattr(server_version_info, 'min_compatibility_major') and server_version_info.min_compatibility_major is not None and \
426+
hasattr(server_version_info, 'min_compatibility_minor') and server_version_info.min_compatibility_minor is not None:
427+
min_major = int(server_version_info.min_compatibility_major)
428+
min_minor = int(server_version_info.min_compatibility_minor)
429+
has_min_compatibility = True
430+
431+
# Check if client version is below minimum compatibility
432+
if client_version[0] < min_major or (client_version[0] == min_major and client_version[1] < min_minor):
433+
logger.warning(
434+
f"Kubernetes version incompatibility detected! Your client version {client_version_str} "
435+
f"(package: {kubernetes_client_version}) is below the minimum compatible version {min_major}.{min_minor} "
436+
f"required by server {server_version_str}. The server explicitly requires a minimum client version."
437+
)
438+
logger.warning(
439+
f"To resolve this issue, please update your kubernetes Python client to meet the minimum requirement."
440+
)
441+
is_compatible = False
442+
except (ValueError, TypeError, AttributeError) as e:
443+
logger.debug(f"Could not parse minimum compatibility version: {e}")
444+
has_min_compatibility = False
445+
446+
if not has_min_compatibility:
447+
# Fall back to standard compatibility check if min versions not provided
448+
server_version_parsed = (int(server_version_info.major), int(server_version_info.minor))
449+
if not is_kubernetes_version_compatible(client_version, server_version_parsed):
450+
logger.warning(
451+
f"Kubernetes version incompatibility detected! Your client version {client_version_str} "
452+
f"(package: {kubernetes_client_version}) is not compatible with server version {server_version_str}. "
453+
f"According to Kubernetes support policy, client should be within 3 minor versions behind "
454+
f"and not more than 1 minor version ahead of the server."
455+
)
456+
logger.warning(
457+
f"To resolve this issue, please update your kubernetes Python client to a compatible version."
458+
)
459+
is_compatible = False
460+
461+
return is_compatible
462+
except Exception as e:
463+
logger.warning(f"Failed to verify Kubernetes version compatibility: {e}")
464+
return True # Be lenient if we can't check compatibility

src/sagemaker/hyperpod/inference/hp_endpoint_base.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
handle_exception,
1515
setup_logging,
1616
get_default_namespace,
17+
verify_kubernetes_version_compatibility,
1718
)
1819
from sagemaker.hyperpod.common.telemetry.telemetry_logging import (
1920
_hyperpod_telemetry_emitter,
@@ -24,15 +25,18 @@
2425
class HPEndpointBase:
2526
is_kubeconfig_loaded = False
2627

28+
@classmethod
29+
def get_logger(cls):
30+
return logging.getLogger(__name__)
31+
2732
@classmethod
2833
def verify_kube_config(cls):
2934
if not cls.is_kubeconfig_loaded:
3035
config.load_kube_config()
3136
cls.is_kubeconfig_loaded = True
32-
33-
@classmethod
34-
def get_logger(cls):
35-
return logging.getLogger(__name__)
37+
38+
# Verify Kubernetes version compatibility
39+
verify_kubernetes_version_compatibility(cls.get_logger())
3640

3741
@classmethod
3842
def call_create_api(

src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,13 @@
33
_HyperPodPytorchJob, HyperPodPytorchJobStatus
44
)
55
from sagemaker.hyperpod.common.config.metadata import Metadata
6-
from kubernetes import client, config
7-
from typing import List, Optional, ClassVar
6+
from kubernetes import client, config, __version__ as kubernetes_client_version
7+
from typing import List, Optional, ClassVar, Tuple
88
from sagemaker.hyperpod.common.utils import (
99
handle_exception,
1010
get_default_namespace,
1111
setup_logging,
12+
verify_kubernetes_version_compatibility
1213
)
1314
from sagemaker.hyperpod.common.telemetry.telemetry_logging import (
1415
_hyperpod_telemetry_emitter,
@@ -17,6 +18,7 @@
1718
import yaml
1819
import logging
1920

21+
2022
TRAINING_GROUP = "sagemaker.amazonaws.com"
2123
API_VERSION = "v1"
2224
PLURAL = "hyperpodpytorchjobs"
@@ -36,15 +38,18 @@ class HyperPodPytorchJob(_HyperPodPytorchJob):
3638
default=None, description="The status of the HyperPodPytorchJob"
3739
)
3840

41+
@classmethod
42+
def get_logger(cls):
43+
return logging.getLogger(__name__)
44+
3945
@classmethod
4046
def verify_kube_config(cls):
4147
if not cls.is_kubeconfig_loaded:
4248
config.load_kube_config()
4349
cls.is_kubeconfig_loaded = True
44-
45-
@classmethod
46-
def get_logger(cls):
47-
return logging.getLogger(__name__)
50+
51+
# Verify Kubernetes version compatibility
52+
verify_kubernetes_version_compatibility(cls.get_logger())
4853

4954
@_hyperpod_telemetry_emitter(Feature.HYPERPOD, "create_pytorchjob")
5055
def create(self, debug=False):

test/unit_tests/common/test_utils.py

Lines changed: 70 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import unittest
22
import subprocess
3-
from unittest.mock import patch, MagicMock, mock_open
3+
import logging
4+
from unittest.mock import patch, MagicMock, mock_open, call
45
from sagemaker.hyperpod.common.utils import (
56
handle_exception,
67
get_eks_name_from_arn,
@@ -11,6 +12,8 @@
1112
list_clusters,
1213
set_cluster_context,
1314
get_cluster_context,
15+
parse_client_kubernetes_version,
16+
is_kubernetes_version_compatible,
1417
)
1518
from kubernetes.client.exceptions import ApiException
1619
from pydantic import ValidationError
@@ -112,6 +115,72 @@ def test_get_region_from_eks_arn_invalid(self):
112115
with self.assertRaises(RuntimeError) as context:
113116
get_region_from_eks_arn("invalid:arn:format")
114117
self.assertIn("cannot get region from EKS ARN", str(context.exception))
118+
119+
def test_parse_client_kubernetes_version_with_v_prefix(self):
120+
"""Test parsing client version with 'v' prefix"""
121+
self.assertEqual(parse_client_kubernetes_version("v12.0.0"), (1, 16))
122+
self.assertEqual(parse_client_kubernetes_version("v17.0.0"), (1, 17))
123+
124+
def test_parse_client_kubernetes_version_old_client_format(self):
125+
"""Test parsing old client version format (v12 and before)"""
126+
# Test old client format (v12 and before)
127+
# v12.0.0 corresponds to Kubernetes v1.16
128+
self.assertEqual(parse_client_kubernetes_version("12.0.0"), (1, 16))
129+
self.assertEqual(parse_client_kubernetes_version("11.0.0"), (1, 15))
130+
self.assertEqual(parse_client_kubernetes_version("10.0.0"), (1, 14))
131+
132+
def test_parse_client_kubernetes_version_new_client_format(self):
133+
"""Test parsing new homogenized client version format (v17+)"""
134+
# Test new homogenized format (v17+)
135+
# v17.0.0 corresponds to Kubernetes v1.17
136+
self.assertEqual(parse_client_kubernetes_version("17.0.0"), (1, 17))
137+
self.assertEqual(parse_client_kubernetes_version("18.0.0"), (1, 18))
138+
self.assertEqual(parse_client_kubernetes_version("24.0.0"), (1, 24))
139+
140+
def test_parse_client_kubernetes_version_with_suffix(self):
141+
"""Test parsing version with suffix"""
142+
self.assertEqual(parse_client_kubernetes_version("24.0.0+snapshot"), (1, 24))
143+
self.assertEqual(parse_client_kubernetes_version("v17.0.0+custom"), (1, 17))
144+
145+
def test_parse_client_kubernetes_version_invalid_format(self):
146+
"""Test parsing invalid version format"""
147+
self.assertEqual(parse_client_kubernetes_version(""), (0, 0))
148+
self.assertEqual(parse_client_kubernetes_version("invalid"), (0, 0))
149+
self.assertEqual(parse_client_kubernetes_version("a.b.c"), (0, 0))
150+
151+
def test_is_kubernetes_version_compatible_same_version(self):
152+
"""Test compatibility check with same versions"""
153+
self.assertTrue(is_kubernetes_version_compatible((1, 24), (1, 24)))
154+
155+
def test_is_kubernetes_version_compatible_within_range(self):
156+
"""Test compatibility check with versions within supported range"""
157+
# Client within 3 minor versions behind server
158+
self.assertTrue(is_kubernetes_version_compatible((1, 23), (1, 24)))
159+
self.assertTrue(is_kubernetes_version_compatible((1, 22), (1, 24)))
160+
self.assertTrue(is_kubernetes_version_compatible((1, 21), (1, 24)))
161+
162+
# Client within 1 minor version ahead of server
163+
self.assertTrue(is_kubernetes_version_compatible((1, 25), (1, 24)))
164+
165+
def test_is_kubernetes_version_compatible_outside_range(self):
166+
"""Test compatibility check with versions outside supported range"""
167+
# Client too old (more than 3 minor versions behind)
168+
self.assertFalse(is_kubernetes_version_compatible((1, 20), (1, 24)))
169+
170+
# Client too new (more than 1 minor version ahead)
171+
self.assertFalse(is_kubernetes_version_compatible((1, 26), (1, 24)))
172+
173+
def test_is_kubernetes_version_compatible_different_major(self):
174+
"""Test compatibility check with different major versions"""
175+
# Different major versions should be incompatible
176+
self.assertFalse(is_kubernetes_version_compatible((2, 0), (1, 0)))
177+
178+
def test_is_kubernetes_version_compatible_default_versions(self):
179+
"""Test compatibility check with default versions (0, 0)"""
180+
# Default versions should be treated as compatible
181+
self.assertTrue(is_kubernetes_version_compatible((0, 0), (1, 24)))
182+
self.assertTrue(is_kubernetes_version_compatible((1, 24), (0, 0)))
183+
self.assertTrue(is_kubernetes_version_compatible((0, 0), (0, 0)))
115184

116185
def test_is_eks_orchestrator_true(self):
117186
mock_client = MagicMock()

test/unit_tests/inference/test_hp_endpoint_base.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,28 @@
77
class TestHPEndpointBase(unittest.TestCase):
88
def setUp(self):
99
self.base = HPEndpointBase()
10+
11+
@patch("sagemaker.hyperpod.inference.hp_endpoint_base.verify_kubernetes_version_compatibility")
12+
@patch("kubernetes.config.load_kube_config")
13+
def test_verify_kube_config(self, mock_load_kube_config, mock_verify_k8s_version):
14+
# Reset the class variable
15+
HPEndpointBase.is_kubeconfig_loaded = False
16+
17+
# Call the method
18+
HPEndpointBase.verify_kube_config()
19+
20+
# Verify both functions were called
21+
mock_load_kube_config.assert_called_once()
22+
mock_verify_k8s_version.assert_called_once_with(HPEndpointBase.get_logger())
23+
24+
# Reset mocks
25+
mock_load_kube_config.reset_mock()
26+
mock_verify_k8s_version.reset_mock()
27+
28+
# Call again - should not call the functions
29+
HPEndpointBase.verify_kube_config()
30+
mock_load_kube_config.assert_not_called()
31+
mock_verify_k8s_version.assert_not_called()
1032

1133
@patch("kubernetes.client.CustomObjectsApi")
1234
@patch.object(HPEndpointBase, "verify_kube_config")

0 commit comments

Comments
 (0)