Commit d3cc946

Merge branch 'master' into fix-duplicate-transformer-docstring
2 parents: 8908fd4 + 2102bb7

File tree

204 files changed (+24920 additions, -3510 deletions)


.gitignore

Lines changed: 3 additions & 0 deletions
@@ -32,6 +32,9 @@ env/
 .python-version
 *.html
 **/_repack_script_launcher.sh
+src/sagemaker/modules/train/container_drivers/sm_train.sh
+src/sagemaker/modules/train/container_drivers/sourcecode.json
+src/sagemaker/modules/train/container_drivers/distributed.json
 tests/data/**/_repack_model.py
 tests/data/experiment/sagemaker-dev-1.0.tar.gz
 src/sagemaker/serve/tmp_workspace

.pydocstylerc

Lines changed: 1 addition & 0 deletions
@@ -2,3 +2,4 @@
 inherit = false
 ignore = D104,D107,D202,D203,D213,D214,D400,D401,D404,D406,D407,D411,D413,D414,D415,D417
 match = (?!record_pb2).*\.py
+match-dir = (?!.*test).*
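
For context, a rough sketch (not part of the commit) of how the new ``match-dir`` pattern behaves: pydocstyle matches it against each directory name it traverses, and the negative lookahead rejects any name containing ``test``, so test directories are skipped during docstring linting.

.. code:: python

    import re

    # Roughly how pydocstyle applies match-dir: the pattern is matched
    # against each directory name; (?!.*test) rejects names containing "test".
    MATCH_DIR = re.compile(r"(?!.*test).*")

    for name in ["sagemaker", "integ", "tests", "test_utils"]:
        print(name, "->", "checked" if MATCH_DIR.match(name) else "skipped")
    # sagemaker -> checked, integ -> checked, tests -> skipped, test_utils -> skipped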

CHANGELOG.md

Lines changed: 148 additions & 0 deletions
@@ -1,5 +1,153 @@
 # Changelog

+## v2.237.2 (2024-12-17)
+
+### Bug Fixes and Other Changes
+
+* update image_uri_configs 12-13-2024 17:07:12 PST
+* Cloudpickle upgrade
+
+## v2.237.1 (2024-12-12)
+
+### Bug Fixes and Other Changes
+
+* chore: remove support for ecr spec fallbacks for jumpstart models
+* Cloudpickle Revert
+* Cloudpickle update
+* Numpy update
+* Protobuf update
+* Update to fetch latest Cloudpickle version
+
+## v2.237.0 (2024-12-05)
+
+### Features
+
+* Support SageMakerTrainingPlan for training jobs
+* AMI support for BRM
+* Adding Bedrock Store model support for HubService
+
+### Bug Fixes and Other Changes
+
+* Fix unit tests
+* update boto3 and sagemaker-core version
+* fix gpu_image uri
+* Hotfix to construct rubik uri correctly
+* fix codestyles
+* fix merge artifact
+* fix merge artifact
+* fix test_requiremenets.txt
+* chore: Merge from main
+
+## v2.236.0 (2024-12-04)
+
+### Features
+
+* Partner App Auth Provider for SDK support
+* add pre-processing and post-processing logic to inference_spec
+* add utility function to capture local snapshot
+* support script mode with local train.sh
+
+### Bug Fixes and Other Changes
+
+* Add graphene to doc requirements
+* Add graphne to the doc requirements
+* Enable the Recipe tests marked with @pytest.mark.skip(reason="Hyperpod recipe code unavailable"
+* Add model trainer documentation
+* Usage docs for training recipes
+* Neuron URIs update
+* Update URIs to public for training recipes
+* Changes for SMP v2.7.0
+* Change default source directory to current, add option to specify source dir
+* Remove default values for fields in recipe_overrides and fix recipe path.
+* Update MANIFEST.in so that wheel builds correctly
+* fix the file uploading signature verification error
+* remove example notebooks artifacts
+* Morpheus tests
+* Integ tests for local mode model trainer
+* Update hyperpod recipe uris
+* Add interface units for ModelTrainer
+* Model Trainer Bucket improvements
+* Update ModelTrainer Interface Parameters
+* add in-process mode definition to docs
+* Intelligent defaults for Model Trainer
+* Fix tests and codestyle
+* add integ test for base_model_builder_deploy and remove print statement
+* Revert image builder
+* pin xgboost dlc to 1.7.1 to fix test
+* Skip JS model mapping with env vars or image URI provided
+* Use sagemaker core Session
+* Integration tests for Model Builder Handshake
+* [Updated] Add telemetry to ModelTrainer, Estimator and ModelBuilder
+* Update kandinsky in ModelTrainer and allow setting requirements
+* add modelID support to model builder InProcess model
+* Add Rich Logging to Model Builder
+* Notebooks update for Bugbash
+* Add bugbash bootstrapping
+* add inference morpheus nbs
+* Update ModelTrainer Notebooks
+* Bug fixes
+* Single container local training
+* update notebooks
+* update notebooks
+* Add recipes examples
+* Unified Deployment interface in Model Builder
+* Use exact python path in trainer template
+* Support building image from Dockerfile
+* Add Support for Training Recipes
+* Trainer handshake
+* Pass hyperparameters as CLI args
+* Add in_process mode support for DJL and TorchServe servers
+* Remove ignored files
+* Simplify Config Class Names and DistributedRunner structures
+* Fix bug in script mode setup ModelTrainer
+* Mask Sensitive Env Logs in Container
+* Add path to set Additional Settings in ModelTrainer
+* Add Distributed Training Support Model Trainer
+* Cleanup ModelTrainer code
+* Latest Container Image
+* General image builder
+* Cleanup ModelTrainer
+* Revert Image Spec
+* Support intelligent parameters
+* Add enviornment variable bootstrapping script
+* Add example notebook
+* Add unit tests for ModelTrainer
+* Image Spec refactoring and updates
+* Base model trainer
+
+## v2.235.2 (2024-11-22)
+
+## v2.235.1 (2024-11-20)
+
+### Bug Fixes and Other Changes
+
+* Update sagemaker-core dep
+* update image_uri_configs 11-20-2024 06:17:41 PST
+
+## v2.235.0 (2024-11-19)
+
+### Features
+
+* Optimize() validations across TRT, VLLM, Neuron container optimizations
+
+### Bug Fixes and Other Changes
+
+* update image_uri_configs 11-19-2024 06:17:58 PST
+
+## v2.234.0 (2024-11-19)
+
+### Features
+
+* optimization technique related validations.
+
+### Bug Fixes and Other Changes
+
+* Revert "change: add TGI 2.4.0 image uri (#4922)"
+* pin testing deps
+* add TGI 2.4.0 image uri
+* add jumpstart ap-southeast-5
+* Move sagemaker-mlflow to extras
+
 ## v2.233.0 (2024-11-04)

 ### Features

MANIFEST.in

Lines changed: 2 additions & 0 deletions
@@ -1,8 +1,10 @@
 recursive-include src/sagemaker *.py

 include src/sagemaker/image_uri_config/*.json
+include src/sagemaker/pytorch/training_recipes.json
 include src/sagemaker/serve/schema/*.json
 include src/sagemaker/serve/requirements.txt
+include src/sagemaker/modules/train/sm_recipes/training_recipes.json
 recursive-include requirements *

 include VERSION
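
A quick way to sanity-check these MANIFEST.in additions (a sketch, assuming a built and installed wheel on Python 3.9+) is to confirm the recipe JSON files are reachable as package resources:

.. code:: python

    # Sketch: confirm the recipe JSON files declared above actually ship
    # inside an installed sagemaker wheel.
    from importlib.resources import files

    for rel in (
        "pytorch/training_recipes.json",
        "modules/train/sm_recipes/training_recipes.json",
    ):
        print(rel, "packaged:", files("sagemaker").joinpath(rel).is_file())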

VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-2.233.1.dev0
+2.237.3.dev0

doc/api/training/index.rst

Lines changed: 1 addition & 0 deletions
@@ -5,6 +5,7 @@ Training APIs
 .. toctree::
     :maxdepth: 4

+    model_trainer
     algorithm
     analytics
     automl

doc/api/training/model_trainer.rst

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+ModelTrainer
+------------
+
+.. autoclass:: sagemaker.modules.train.model_trainer.ModelTrainer
+    :members:
+
+Configs
+~~~~~~~
+
+.. automodule:: sagemaker.modules.configs
+    :members:
+
+Distributed
+~~~~~~~~~~~
+
+.. automodule:: sagemaker.modules.distributed
+    :members:
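
For orientation, here is a hypothetical minimal use of the class documented above. The parameter names (``training_image``, ``source_code``, ``compute``) and the ``SourceCode`` config are assumptions inferred from the docs in this commit and may differ from the released interface:

.. code:: python

    # Hypothetical sketch of a minimal ModelTrainer run; names inferred from
    # this commit's docs, not a definitive API reference.
    from sagemaker.modules.train import ModelTrainer
    from sagemaker.modules.configs import SourceCode, Compute

    model_trainer = ModelTrainer(
        training_image="<training-image-uri>",  # assumption: any training DLC URI
        source_code=SourceCode(source_dir=".", entry_script="train.py"),
        compute=Compute(instance_type="ml.m5.xlarge"),
    )
    model_trainer.train()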

doc/frameworks/pytorch/using_pytorch.rst

Lines changed: 125 additions & 7 deletions
@@ -21,12 +21,9 @@ To train a PyTorch model by using the SageMaker Python SDK:
 .. |create pytorch estimator| replace:: Create a ``sagemaker.pytorch.PyTorch`` Estimator
 .. _create pytorch estimator: #create-an-estimator

-.. |call fit| replace:: Call the estimator's ``fit`` method
-.. _call fit: #call-the-fit-method
-
-1. `Prepare a training script <#prepare-a-pytorch-training-script>`_
+1. `Prepare a training script <#prepare-a-pytorch-training-script>`_ OR `Choose an Amazon SageMaker HyperPod recipe`_
 2. |create pytorch estimator|_
-3. |call fit|_
+3. `Call the estimator's fit method or ModelTrainer's train method`_

 Prepare a PyTorch Training Script
 =================================
@@ -175,6 +172,16 @@ see `AWS Deep Learning Containers <https://github.com/aws/deep-learning-containe
 - `Images for HuggingFace <https://github.com/aws/deep-learning-containers/tree/master/huggingface>`__


+Choose an Amazon SageMaker HyperPod recipe
+==========================================
+
+Alternatively, instead of using your own training script, you can choose an
+`Amazon SageMaker HyperPod recipe <https://github.com/aws/sagemaker-hyperpod-recipes>`_ to launch training for a supported model.
+If you use a recipe, you do not need to provide your own training script; you only need to decide
+which recipe to run. You can modify a recipe as explained in the next section.
+
+
+
 Create an Estimator
 ===================

@@ -196,10 +203,121 @@ directories ('train' and 'test').
                            'test': 's3://my-data-bucket/path/to/my/test/data'})


+Amazon SageMaker HyperPod recipes
+---------------------------------
+Alternatively, if you are using Amazon SageMaker HyperPod recipes, follow these instructions:

+Prerequisites: you need ``git`` installed on your client to access the Amazon SageMaker HyperPod recipes code.

-Call the fit Method
-===================
+When using a recipe, you must set the ``training_recipe`` argument in place of providing a training script.
+This can be a recipe from the `Amazon SageMaker HyperPod recipes repository <https://github.com/aws/sagemaker-hyperpod-recipes>`_,
+a local file, or a custom URL. Please note that you must override the following using
+``recipe_overrides``:
+
+* the directory paths for the local container in the recipe, as appropriate for the Python SDK
+* the output S3 URIs
+* the Hugging Face access token
+* any other recipe fields you wish to edit
+
+The code snippet below shows an example.
+Please refer to the `SageMaker docs <https://docs.aws.amazon.com/sagemaker/latest/dg/model-train-storage.html>`_
+for more details about the expected local paths in the container, and to the Amazon SageMaker
+HyperPod recipes tutorial for more examples.
+You can override the fields either by setting ``recipe_overrides`` or by
+providing a modified ``training_recipe`` through a local file or a custom URL.
+When using a recipe, any provided ``entry_point`` is ignored.
+
+SageMaker automatically sets up the distribution arguments.
+It also determines the image to use for your model and device type,
+but you can override this with the ``image_uri`` argument.
+
+You can also override the number of nodes in the recipe with the estimator's ``instance_count`` argument.
+``source_dir`` defaults to the current working directory unless specified.
+A local copy of the training scripts and the recipe is saved in the ``source_dir``.
+You can specify any additional packages to install for training in an optional ``requirements.txt`` in the ``source_dir``.
+
+Note: for Llama 3.2 multi-modal models, you need to upgrade the ``transformers`` library by providing a ``requirements.txt`` in the ``source_dir`` that pins ``transformers==4.45.2``.
+Please refer to the Amazon SageMaker HyperPod recipes documentation for more details.
+
+
+Here is an example usage for the recipe ``hf_llama3_8b_seq8k_gpu_p5x16_pretrain``.
+
+
+.. code:: python
+
+    recipe_overrides = {
+        "run": {
+            "results_dir": "/opt/ml/model",
+        },
+        "exp_manager": {
+            "exp_dir": "",
+            "explicit_log_dir": "/opt/ml/output/tensorboard",
+            "checkpoint_dir": "/opt/ml/checkpoints",
+        },
+        "model": {
+            "data": {
+                "train_dir": "/opt/ml/input/data/train",
+                "val_dir": "/opt/ml/input/data/val",
+            },
+        },
+    }
+    pytorch_estimator = PyTorch(
+        output_path=output_path,
+        base_job_name="llama-recipe",
+        role=role,
+        instance_type="ml.p5.48xlarge",
+        training_recipe="hf_llama3_8b_seq8k_gpu_p5x16_pretrain",
+        recipe_overrides=recipe_overrides,
+        sagemaker_session=sagemaker_session,
+        tensorboard_output_config=tensorboard_output_config,
+    )
+    pytorch_estimator.fit({'train': 's3://my-data-bucket/path/to/my/training/data',
+                           'test': 's3://my-data-bucket/path/to/my/test/data'})
+
+    # Or alternatively with ModelTrainer
+    recipe_overrides = {
+        "run": {
+            "results_dir": "/opt/ml/model",
+        },
+        "exp_manager": {
+            "exp_dir": "",
+            "explicit_log_dir": "/opt/ml/output/tensorboard",
+            "checkpoint_dir": "/opt/ml/checkpoints",
+        },
+        "model": {
+            "data": {
+                "train_dir": "/opt/ml/input/data/train",
+                "val_dir": "/opt/ml/input/data/val",
+            },
+        },
+    }
+
+    model_trainer = ModelTrainer.from_recipe(
+        output_path=output_path,
+        base_job_name="llama-recipe",
+        training_recipe="training/llama/hf_llama3_8b_seq8k_gpu_p5x16_pretrain",
+        recipe_overrides=recipe_overrides,
+        compute=Compute(instance_type="ml.p5.48xlarge"),
+        sagemaker_session=sagemaker_session,
+    ).with_tensorboard_output_config(
+        tensorboard_output_config=tensorboard_output_config
+    )
+
+    train_input = Input(
+        channel_name="train",
+        data_source="s3://my-data-bucket/path/to/my/training/data",
+    )
+
+    test_input = Input(
+        channel_name="test",
+        data_source="s3://my-data-bucket/path/to/my/test/data",
+    )
+
+    model_trainer.train(input_data_config=[train_input, test_input])
+
+
+Call the estimator's fit method or ModelTrainer's train method
+==============================================================

 You start your training script by calling ``fit`` on a ``PyTorch`` Estimator. ``fit`` takes both required and optional
 arguments.
