Merge branch 'master' into fw-and-version-bug

aviruthen · web-flow · commit a4352a8e22c5 · 2025-12-17T12:22:04.000-05:00
diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,37 @@
+---
+name: Bug report
+about: File a report to help us reproduce and fix the problem
+title: ''
+labels: 'bug'
+assignees: ''
+
+---
+
+**PySDK Version**
+- [ ] PySDK V2 (2.x)
+- [ ] PySDK V3 (3.x)
+
+**Describe the bug**
+A clear and concise description of what the bug is.
+
+**To reproduce**
+A clear, step-by-step set of instructions to reproduce the bug.
+The provided code needs to be **complete** and **runnable**. If additional data is needed, please include it in the issue.
+
+**Expected behavior**
+A clear and concise description of what you expected to happen.
+
+**Screenshots or logs**
+If applicable, add screenshots or logs to help explain your problem.
+
+**System information**
+A description of your system. Please provide:
+- **SageMaker Python SDK version**:
+- **Framework name (e.g. PyTorch) or algorithm (e.g. KMeans)**:
+- **Framework version**:
+- **Python version**:
+- **CPU or GPU**:
+- **Custom Docker image (Y/N)**:
+
+**Additional context**
+Add any other context about the problem here.
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1,5 @@
+blank_issues_enabled: false
+contact_links:
+  - name: Ask a question
+    url: https://github.com/aws/sagemaker-python-sdk/discussions
+    about: Use GitHub Discussions to ask and answer questions
diff --git a/.github/ISSUE_TEMPLATE/documentation-request.md b/.github/ISSUE_TEMPLATE/documentation-request.md
@@ -0,0 +1,17 @@
+---
+name: Documentation request
+about: Request improved documentation
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+**What did you find confusing? Please describe.**
+A clear and concise description of what you found confusing. Ex. I tried to [...] but I didn't understand how to [...]
+
+**Describe how documentation can be improved**
+A clear and concise description of where documentation was lacking and how it can be improved.
+
+**Additional context**
+Add any other context or screenshots about the documentation request here.
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,20 @@
+---
+name: Feature request
+about: Suggest new functionality for this library
+title: ''
+labels: 'feature request'
+assignees: ''
+
+---
+
+**Describe the feature you'd like**
+A clear and concise description of the functionality you want.
+
+**How would this feature be used? Please describe.**
+A clear and concise description of the use case for this feature. Please provide an example, if possible.
+
+**Describe alternatives you've considered**
+A clear and concise description of any alternative solutions or features you've considered.
+
+**Additional context**
+Add any other context or screenshots about the feature request here.
diff --git a/sagemaker-core/src/sagemaker/core/local/utils.py b/sagemaker-core/src/sagemaker/core/local/utils.py
@@ -137,7 +137,11 @@ def get_child_process_ids(pid):
     Returns:
         (List[int]): Child process ids
     """
-    cmd = f"pgrep -P {pid}".split()
+    if not str(pid).isdigit():
+        raise ValueError("Invalid PID")
+    
+    cmd = ["pgrep", "-P", str(pid)]
+    
     process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     output, err = process.communicate()
     if err:
diff --git a/sagemaker-core/tests/unit/local/test_local_utils.py b/sagemaker-core/tests/unit/local/test_local_utils.py
@@ -103,21 +103,24 @@ def test_recursive_copy(copy_tree, m_os_path):
 @patch("sagemaker.core.local.utils.os")
 @patch("sagemaker.core.local.utils.get_child_process_ids")
 def test_kill_child_processes(m_get_child_process_ids, m_os):
-    m_get_child_process_ids.return_value = ["child_pids"]
-    kill_child_processes("pid")
-    m_os.kill.assert_called_with("child_pids", 15)
+    m_get_child_process_ids.return_value = ["345"]
+    kill_child_processes("123")
+    m_os.kill.assert_called_with("345", 15)
 
 
 @patch("sagemaker.core.local.utils.subprocess")
 def test_get_child_process_ids(m_subprocess):
-    cmd = "pgrep -P pid".split()
+    cmd = "pgrep -P 123".split()
     process_mock = Mock()
     attrs = {"communicate.return_value": (b"\n", False), "returncode": 0}
     process_mock.configure_mock(**attrs)
     m_subprocess.Popen.return_value = process_mock
-    get_child_process_ids("pid")
+    get_child_process_ids("123")
     m_subprocess.Popen.assert_called_with(cmd, stdout=m_subprocess.PIPE, stderr=m_subprocess.PIPE)
 
+def test_get_child_process_ids_exception():
+    with pytest.raises(ValueError, match="Invalid PID"):
+        get_child_process_ids("abc")
 
 @patch("sagemaker.core.local.utils.subprocess")
 def test_get_docker_host(m_subprocess):
diff --git a/sagemaker-train/src/sagemaker/train/common_utils/model_resolution.py b/sagemaker-train/src/sagemaker/train/common_utils/model_resolution.py
@@ -13,6 +13,8 @@
 from dataclasses import dataclass
 from enum import Enum
 import re
+from sagemaker.train.base_trainer import BaseTrainer
+from sagemaker.core.utils.utils import Unassigned
 
 
 class _ModelType(Enum):
@@ -65,14 +67,14 @@ def __init__(self, sagemaker_session=None):
     
     def resolve_model_info(
         self, 
-        base_model: Union[str, 'ModelPackage'],
+        base_model: Union[str, BaseTrainer, 'ModelPackage'],
         hub_name: Optional[str] = None
     ) -> _ModelInfo:
         """
         Resolve model information from various input types.
         
         Args:
-            base_model: Either a JumpStart model ID (str) or ModelPackage object/ARN
+            base_model: A JumpStart model ID (str), a ModelPackage object/ARN, or a BaseTrainer object with a completed training job
             hub_name: Optional hub name for JumpStart models (defaults to SageMakerPublicHub)
         
         Returns:
@@ -88,6 +90,17 @@ def resolve_model_info(
                 return self._resolve_model_package_arn(base_model)
             else:
                 return self._resolve_jumpstart_model(base_model, hub_name or self.DEFAULT_HUB_NAME)
+        # Handle BaseTrainer type
+        elif isinstance(base_model, BaseTrainer):
+            if hasattr(base_model, '_latest_training_job') and hasattr(base_model._latest_training_job,
+                                                              'output_model_package_arn'):
+                arn = base_model._latest_training_job.output_model_package_arn
+                if not isinstance(arn, Unassigned):
+                    return self._resolve_model_package_arn(arn)
+                else:
+                    raise ValueError("BaseTrainer must have completed training job to be used for evaluation")
+            else:
+                raise ValueError("BaseTrainer must have completed training job to be used for evaluation")
         else:
             # Not a string, so assume it's a ModelPackage object
             # Check if it has the expected attributes of a ModelPackage
diff --git a/sagemaker-train/src/sagemaker/train/evaluate/base_evaluator.py b/sagemaker-train/src/sagemaker/train/evaluate/base_evaluator.py
@@ -13,12 +13,13 @@
 
 from pydantic import BaseModel, validator
 
-from sagemaker.core.resources import ModelPackageGroup
+from sagemaker.core.resources import ModelPackageGroup, ModelPackage
 from sagemaker.core.shapes import VpcConfig
 
 if TYPE_CHECKING:
     from sagemaker.core.helper.session_helper import Session
 
+from sagemaker.train.base_trainer import BaseTrainer
 # Module-level logger
 _logger = logging.getLogger(__name__)
 
@@ -53,6 +54,7 @@ class BaseEvaluator(BaseModel):
             - JumpStart model ID (str): e.g., 'llama3-2-1b-instruct'
             - ModelPackage object: A fine-tuned model package
             - ModelPackage ARN (str): e.g., 'arn:aws:sagemaker:region:account:model-package/name/version'
+            - BaseTrainer object: A trainer whose training job has completed (i.e., it must have _latest_training_job with output_model_package_arn populated)
         base_eval_name (Optional[str]): Optional base name for evaluation jobs. This name is used
             as the PipelineExecutionDisplayName when creating the SageMaker pipeline execution.
             The actual display name will be "{base_eval_name}-{timestamp}". This parameter can
@@ -86,7 +88,7 @@ class BaseEvaluator(BaseModel):
     
     region: Optional[str] = None
     sagemaker_session: Optional[Any] = None
-    model: Union[str, Any]
+    model: Union[str, BaseTrainer, ModelPackage]
     base_eval_name: Optional[str] = None
     s3_output_path: str
     mlflow_resource_arn: Optional[str] = None
@@ -278,7 +280,7 @@ def _validate_mlflow_arn_format(cls, v: Optional[str]) -> Optional[str]:
         return v
     
     @validator('model')
-    def _resolve_model_info(cls, v: Union[str, Any], values: dict) -> Union[str, Any]:
+    def _resolve_model_info(cls, v: Union[str, BaseTrainer, ModelPackage], values: dict) -> Union[str, Any]:
         """Resolve model information from various input types.
         
         This validator uses the common model resolution utility to extract:
@@ -289,7 +291,7 @@ def _resolve_model_info(cls, v: Union[str, Any], values: dict) -> Union[str, Any
         The resolved information is stored in private attributes for use by subclasses.
         
         Args:
-            v (Union[str, Any]): Model identifier (JumpStart ID, ModelPackage, or ARN).
+            v (Union[str, BaseTrainer, ModelPackage]): Model identifier (JumpStart ID, ModelPackage, ARN, or BaseTrainer).
             values (dict): Dictionary of already-validated fields.
             
         Returns:
diff --git a/sagemaker-train/src/sagemaker/train/evaluate/benchmark_evaluator.py b/sagemaker-train/src/sagemaker/train/evaluate/benchmark_evaluator.py
@@ -300,18 +300,10 @@ class BenchMarkEvaluator(BaseEvaluator):
     """
     
     benchmark: _Benchmark
-    dataset: Union[str, Any]  # Required field, must come before optional fields
     subtasks: Optional[Union[str, List[str]]] = None
     evaluate_base_model: bool = True
     _hyperparameters: Optional[Any] = None
-    
-    @validator('dataset', pre=True)
-    def _resolve_dataset(cls, v):
-        """Resolve dataset to string (S3 URI or ARN) and validate format.
-        
-        Uses BaseEvaluator's common validation logic to avoid code duplication.
-        """
-        return BaseEvaluator._validate_and_resolve_dataset(v)
+
     
     @validator('benchmark')
     def _validate_benchmark_model_compatibility(cls, v, values):
diff --git a/sagemaker-train/src/sagemaker/train/rlaif_trainer.py b/sagemaker-train/src/sagemaker/train/rlaif_trainer.py
@@ -286,7 +286,7 @@ def train(self, training_dataset: Optional[Union[str, DataSet]] = None, validati
             except TimeoutExceededError as e:
                 logger.error("Error: %s", e)
 
-        self.latest_training_job = training_job
+        self._latest_training_job = training_job
         return training_job
 
     def _process_hyperparameters(self):
diff --git a/sagemaker-train/src/sagemaker/train/rlvr_trainer.py b/sagemaker-train/src/sagemaker/train/rlvr_trainer.py
@@ -274,5 +274,5 @@ def train(self, training_dataset: Optional[Union[str, DataSet]] = None,
             except TimeoutExceededError as e:
                 logger.error("Error: %s", e)
 
-        self.latest_training_job = training_job
+        self._latest_training_job = training_job
         return training_job
diff --git a/sagemaker-train/src/sagemaker/train/sft_trainer.py b/sagemaker-train/src/sagemaker/train/sft_trainer.py
@@ -268,7 +268,7 @@ def train(self, training_dataset: Optional[Union[str, DataSet]] = None, validati
             except TimeoutExceededError as e:
                 logger.error("Error: %s", e)
 
-        self.latest_training_job = training_job
+        self._latest_training_job = training_job
         return training_job
 
 
diff --git a/sagemaker-train/tests/unit/train/common_utils/test_model_resolution.py b/sagemaker-train/tests/unit/train/common_utils/test_model_resolution.py
@@ -24,6 +24,8 @@
     _ModelResolver,
     _resolve_base_model,
 )
+from sagemaker.train.base_trainer import BaseTrainer
+from sagemaker.core.utils.utils import Unassigned
 
 
 class TestModelType:
@@ -557,3 +559,74 @@ def test_resolve_base_model_with_hub_name(self, mock_resolver_class):
         _resolve_base_model("test-model", hub_name="CustomHub")
         
         mock_resolver.resolve_model_info.assert_called_once_with("test-model", "CustomHub")
+
+
+class TestBaseTrainerHandling:
+    """Tests for BaseTrainer model handling in _resolve_base_model."""
+    
+    def test_base_trainer_with_valid_training_job(self):
+        """Test BaseTrainer with valid completed training job."""
+        # Create concrete BaseTrainer subclass for testing
+        class TestTrainer(BaseTrainer):
+            def train(self, input_data_config, wait=True, logs=True):
+                pass
+        
+        mock_trainer = TestTrainer()
+        mock_training_job = MagicMock()
+        mock_training_job.output_model_package_arn = "arn:aws:sagemaker:us-west-2:123456789012:model-package/my-package/1"
+        mock_trainer._latest_training_job = mock_training_job
+        
+        with patch('sagemaker.train.common_utils.model_resolution._ModelResolver._resolve_model_package_arn') as mock_resolve_arn:
+            mock_resolve_arn.return_value = MagicMock()
+            
+            result = _resolve_base_model(mock_trainer)
+            
+            # Verify model package ARN resolution was called
+            mock_resolve_arn.assert_called_once_with(
+                "arn:aws:sagemaker:us-west-2:123456789012:model-package/my-package/1"
+            )
+    
+    def test_base_trainer_with_unassigned_arn(self):
+        """Test BaseTrainer with Unassigned output_model_package_arn raises error."""
+        # Create concrete BaseTrainer subclass for testing
+        class TestTrainer(BaseTrainer):
+            def train(self, input_data_config, wait=True, logs=True):
+                pass
+        
+        mock_trainer = TestTrainer()
+        mock_training_job = MagicMock()
+        mock_training_job.output_model_package_arn = Unassigned()
+        mock_trainer._latest_training_job = mock_training_job
+        
+        with pytest.raises(ValueError, match="BaseTrainer must have completed training job"):
+            _resolve_base_model(mock_trainer)
+    
+    def test_base_trainer_without_training_job(self):
+        """Test BaseTrainer without _latest_training_job raises error."""
+        # Create concrete BaseTrainer subclass for testing
+        class TestTrainer(BaseTrainer):
+            def train(self, input_data_config, wait=True, logs=True):
+                pass
+        
+        mock_trainer = TestTrainer()
+        # Don't set _latest_training_job attribute at all
+        
+        with pytest.raises(ValueError, match="BaseTrainer must have completed training job"):
+            _resolve_base_model(mock_trainer)
+    
+    def test_base_trainer_without_output_model_package_arn_attribute(self):
+        """Test BaseTrainer with training job but missing output_model_package_arn attribute."""
+        # Create concrete BaseTrainer subclass for testing
+        class TestTrainer(BaseTrainer):
+            def train(self, input_data_config, wait=True, logs=True):
+                pass
+        
+        # Create a simple object without output_model_package_arn
+        class TrainingJobWithoutArn:
+            pass
+        
+        mock_trainer = TestTrainer()
+        mock_trainer._latest_training_job = TrainingJobWithoutArn()
+        
+        with pytest.raises(ValueError, match="BaseTrainer must have completed training job"):
+            _resolve_base_model(mock_trainer)
diff --git a/sagemaker-train/tests/unit/train/evaluate/test_base_evaluator.py b/sagemaker-train/tests/unit/train/evaluate/test_base_evaluator.py
@@ -20,6 +20,8 @@
 from sagemaker.core.shapes import VpcConfig
 from sagemaker.core.resources import ModelPackageGroup, Artifact
 from sagemaker.core.shapes import ArtifactSource, ArtifactSourceType
+from sagemaker.core.utils.utils import Unassigned
+from sagemaker.train.base_trainer import BaseTrainer
 
 from sagemaker.train.evaluate.base_evaluator import BaseEvaluator
 
diff --git a/sagemaker-train/tests/unit/train/evaluate/test_benchmark_evaluator.py b/sagemaker-train/tests/unit/train/evaluate/test_benchmark_evaluator.py