Fix unit test failure and fix hub model version resolution (paginate list_hub_content_versions; compare versions semantically instead of as strings)

Narrohag · Narrohag · commit 622f70681913 · 2025-03-06T19:32:36.000Z
diff --git a/src/sagemaker/estimator.py b/src/sagemaker/estimator.py
@@ -2511,6 +2511,7 @@ def start_new(cls, estimator, inputs, experiment_config):
         train_args = cls._get_train_args(estimator, inputs, experiment_config)
 
         logger.debug("Train args after processing defaults: %s", train_args)
+        print("rohan debug: ", train_args)
         estimator.sagemaker_session.train(**train_args)
 
         return cls(estimator.sagemaker_session, estimator._current_job_name)
diff --git a/src/sagemaker/jumpstart/hub/interfaces.py b/src/sagemaker/jumpstart/hub/interfaces.py
@@ -630,7 +630,6 @@ def from_json(self, json_obj: Dict[str, Any]) -> None:
             if json_obj.get("ValidationSupported")
             else None
         )
-        self.default_training_dataset_uri: Optional[str] = json_obj.get("DefaultTrainingDatasetUri")
         self.resource_name_base: Optional[str] = json_obj.get("ResourceNameBase")
         self.gated_bucket: bool = bool(json_obj.get("GatedBucket", False))
         self.default_payloads: Optional[Dict[str, JumpStartSerializablePayload]] = (
@@ -671,6 +670,7 @@ def from_json(self, json_obj: Dict[str, Any]) -> None:
         )
 
         if self.training_supported:
+            self.default_training_dataset_uri: Optional[str] = json_obj.get("DefaultTrainingDatasetUri")
             self.training_model_package_artifact_uri: Optional[str] = json_obj.get(
                 "TrainingModelPackageArtifactUri"
             )
diff --git a/src/sagemaker/jumpstart/hub/utils.py b/src/sagemaker/jumpstart/hub/utils.py
@@ -22,6 +22,7 @@
 from sagemaker.jumpstart.types import HubContentType, HubArnExtractedInfo
 from sagemaker.jumpstart import constants
 from packaging.specifiers import SpecifierSet, InvalidSpecifier
+from packaging import version
 
 PROPRIETARY_VERSION_KEYWORD = "@marketplace-version:"
 
@@ -219,9 +220,7 @@ def get_hub_model_version(
         sagemaker_session = constants.DEFAULT_JUMPSTART_SAGEMAKER_SESSION
 
     try:
-        hub_content_summaries = sagemaker_session.list_hub_content_versions(
-            hub_name=hub_name, hub_content_name=hub_model_name, hub_content_type=hub_model_type
-        ).get("HubContentSummaries")
+        hub_content_summaries = _list_hub_content_versions_helper(hub_name=hub_name, hub_content_name=hub_model_name, hub_content_type=hub_model_type, sagemaker_session=sagemaker_session)
     except Exception as ex:
         raise Exception(f"Failed calling list_hub_content_versions: {str(ex)}")
 
@@ -237,14 +236,26 @@ def get_hub_model_version(
             return marketplace_hub_content_version
         raise
 
+def _list_hub_content_versions_helper(hub_name, hub_content_name, hub_content_type, sagemaker_session):
+    """Return the "HubContentSummaries" entries of every list_hub_content_versions page as one list."""
+    request = {"hub_name": hub_name, "hub_content_name": hub_content_name, "hub_content_type": hub_content_type}
+    all_hub_content_summaries = []
+    while True:
+        response = sagemaker_session.list_hub_content_versions(**request)
+        # A page may lack "HubContentSummaries"; extend(None) would raise TypeError, so fall back to [].
+        all_hub_content_summaries.extend(response.get("HubContentSummaries") or [])
+        if not response.get("NextToken"):
+            return all_hub_content_summaries
+        # Only follow-up calls carry next_token; the first call is made without it.
+        request["next_token"] = response["NextToken"]
 
 def _get_hub_model_version_for_open_weight_version(
     hub_content_summaries: List[Any], hub_model_version: Optional[str] = None
 ) -> str:
     available_model_versions = [model.get("HubContentVersion") for model in hub_content_summaries]
 
     if hub_model_version == "*" or hub_model_version is None:
-        return str(max(available_model_versions))
+        return str(max(version.parse(v) for v in available_model_versions))
 
     try:
         spec = SpecifierSet(f"=={hub_model_version}")
diff --git a/src/sagemaker/jumpstart/types.py b/src/sagemaker/jumpstart/types.py
@@ -1940,11 +1940,6 @@ def use_inference_script_uri(self) -> bool:
 
     def use_training_model_artifact(self) -> bool:
         """Returns True if the model should use a model uri when kicking off training job."""
-        # gated model never use training model artifact
-        if self.gated_bucket:
-            return False
-
-        # otherwise, return true is a training model package is not set
         return len(self.training_model_package_artifact_uris or {}) == 0
 
     def is_gated_model(self) -> bool:
diff --git a/tests/integ/sagemaker/jumpstart/private_hub/estimator/test_jumpstart_private_hub_estimator.py b/tests/integ/sagemaker/jumpstart/private_hub/estimator/test_jumpstart_private_hub_estimator.py
@@ -60,17 +60,12 @@ def add_model_references():
 
 
 def test_jumpstart_hub_estimator(setup, add_model_references):
-
     model_id, model_version = "huggingface-spc-bert-base-cased", "*"
 
-    sagemaker_session = get_sm_session()
-
     estimator = JumpStartEstimator(
         model_id=model_id,
-        role=sagemaker_session.get_caller_identity_arn(),
-        sagemaker_session=sagemaker_session,
-        tags=[{"Key": JUMPSTART_TAG, "Value": os.environ[ENV_VAR_JUMPSTART_SDK_TEST_SUITE_ID]}],
         hub_name=os.environ[ENV_VAR_JUMPSTART_SDK_TEST_HUB_NAME],
+        tags=[{"Key": JUMPSTART_TAG, "Value": os.environ[ENV_VAR_JUMPSTART_SDK_TEST_SUITE_ID]}],
     )
 
     estimator.fit(
@@ -85,22 +80,20 @@ def test_jumpstart_hub_estimator(setup, add_model_references):
         training_job_name=estimator.latest_training_job.name,
         model_id=model_id,
         model_version=model_version,
-        sagemaker_session=get_sm_session(),
     )
 
     # uses ml.p3.2xlarge instance
     predictor = estimator.deploy(
         tags=[{"Key": JUMPSTART_TAG, "Value": os.environ[ENV_VAR_JUMPSTART_SDK_TEST_SUITE_ID]}],
-        role=get_sm_session().get_caller_identity_arn(),
-        sagemaker_session=get_sm_session(),
     )
 
     response = predictor.predict(["hello", "world"])
 
     assert response is not None
 
 
-def test_jumpstart_hub_estimator_with_default_session(setup, add_model_references):
+def test_jumpstart_hub_estimator_with_session(setup, add_model_references):
+
     model_id, model_version = "huggingface-spc-bert-base-cased", "*"
 
     sagemaker_session = get_sm_session()
@@ -125,12 +118,14 @@ def test_jumpstart_hub_estimator_with_default_session(setup, add_model_reference
         training_job_name=estimator.latest_training_job.name,
         model_id=model_id,
         model_version=model_version,
+        sagemaker_session=get_sm_session(),
     )
 
     # uses ml.p3.2xlarge instance
     predictor = estimator.deploy(
         tags=[{"Key": JUMPSTART_TAG, "Value": os.environ[ENV_VAR_JUMPSTART_SDK_TEST_SUITE_ID]}],
         role=get_sm_session().get_caller_identity_arn(),
+        sagemaker_session=get_sm_session(),
     )
 
     response = predictor.predict(["hello", "world"])
@@ -144,9 +139,8 @@ def test_jumpstart_hub_gated_estimator_with_eula(setup, add_model_references):
 
     estimator = JumpStartEstimator(
         model_id=model_id,
-        role=get_sm_session().get_caller_identity_arn(),
-        sagemaker_session=get_sm_session(),
         hub_name=os.environ[ENV_VAR_JUMPSTART_SDK_TEST_HUB_NAME],
+        tags=[{"Key": JUMPSTART_TAG, "Value": os.environ[ENV_VAR_JUMPSTART_SDK_TEST_SUITE_ID]}],
     )
 
     estimator.fit(
@@ -161,14 +155,11 @@ def test_jumpstart_hub_gated_estimator_with_eula(setup, add_model_references):
         training_job_name=estimator.latest_training_job.name,
         model_id=model_id,
         model_version=model_version,
-        sagemaker_session=get_sm_session(),
     )
 
     # uses ml.p3.2xlarge instance
     predictor = estimator.deploy(
         tags=[{"Key": JUMPSTART_TAG, "Value": os.environ[ENV_VAR_JUMPSTART_SDK_TEST_SUITE_ID]}],
-        role=get_sm_session().get_caller_identity_arn(),
-        sagemaker_session=get_sm_session(),
     )
 
     response = predictor.predict(["hello", "world"])
@@ -182,9 +173,8 @@ def test_jumpstart_hub_gated_estimator_without_eula(setup, add_model_references)
 
     estimator = JumpStartEstimator(
         model_id=model_id,
-        role=get_sm_session().get_caller_identity_arn(),
-        sagemaker_session=get_sm_session(),
         hub_name=os.environ[ENV_VAR_JUMPSTART_SDK_TEST_HUB_NAME],
+        tags=[{"Key": JUMPSTART_TAG, "Value": os.environ[ENV_VAR_JUMPSTART_SDK_TEST_SUITE_ID]}],
     )
     with pytest.raises(Exception):
         estimator.fit(
diff --git a/tests/unit/sagemaker/jumpstart/constants.py b/tests/unit/sagemaker/jumpstart/constants.py
@@ -15553,6 +15553,8 @@
     },
     "inference_enable_network_isolation": True,
     "training_enable_network_isolation": True,
+    "default_training_dataset_uri": None,
+    "default_training_dataset_key": "training-datasets/tf_flowers/",
     "resource_name_base": "pt-ic-mobilenet-v2",
     "hosting_eula_key": None,
     "hosting_model_package_arns": {},
diff --git a/tests/unit/sagemaker/jumpstart/test_types.py b/tests/unit/sagemaker/jumpstart/test_types.py
@@ -378,6 +378,7 @@ def test_jumpstart_model_specs():
         specs1.training_script_key
         == "source-directory-tarballs/pytorch/transfer_learning/ic/v2.3.0/sourcedir.tar.gz"
     )
+    assert specs1.default_training_dataset_key == "training-datasets/tf_flowers/"
     assert specs1.hyperparameters == [
         JumpStartHyperparameter(
             {

Original file line number	Diff line number	Diff line change
`@@ -630,7 +630,6 @@ def from_json(self, json_obj: Dict[str, Any]) -> None:`
`630`	`630`	`if json_obj.get("ValidationSupported")`
`631`	`631`	`else None`
`632`	`632`	`)`
`633`		`- self.default_training_dataset_uri: Optional[str] = json_obj.get("DefaultTrainingDatasetUri")`
`634`	`633`	`self.resource_name_base: Optional[str] = json_obj.get("ResourceNameBase")`
`635`	`634`	`self.gated_bucket: bool = bool(json_obj.get("GatedBucket", False))`
`636`	`635`	`self.default_payloads: Optional[Dict[str, JumpStartSerializablePayload]] = (`
`@@ -671,6 +670,7 @@ def from_json(self, json_obj: Dict[str, Any]) -> None:`
`671`	`670`	`)`
`672`	`671`
`673`	`672`	`if self.training_supported:`
	`673`	`+ self.default_training_dataset_uri: Optional[str] = json_obj.get("DefaultTrainingDatasetUri")`
`674`	`674`	`self.training_model_package_artifact_uri: Optional[str] = json_obj.get(`
`675`	`675`	`"TrainingModelPackageArtifactUri"`
`676`	`676`	`)`
Original file line number	Diff line number	Diff line change
`@@ -378,6 +378,7 @@ def test_jumpstart_model_specs():`
`378`	`378`	`specs1.training_script_key`
`379`	`379`	`== "source-directory-tarballs/pytorch/transfer_learning/ic/v2.3.0/sourcedir.tar.gz"`
`380`	`380`	`)`
	`381`	`+ assert specs1.default_training_dataset_key == "training-datasets/tf_flowers/"`
`381`	`382`	`assert specs1.hyperparameters == [`
`382`	`383`	`JumpStartHyperparameter(`
`383`	`384`	`{`