aws
diff --git a/‎.gitignore‎
Lines changed: 2 additions & 1 deletion b/‎.gitignore‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎CHANGELOG.md‎
Lines changed: 54 additions & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 54 additions & 0 deletions
diff --git a/‎VERSION‎
Lines changed: 1 addition & 1 deletion b/‎VERSION‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎pyproject.toml‎
Lines changed: 2 additions & 2 deletions b/‎pyproject.toml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎requirements/extras/test_requirements.txt‎
Lines changed: 1 addition & 1 deletion b/‎requirements/extras/test_requirements.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/sagemaker/amtviz/__init__.py‎
Lines changed: 27 additions & 0 deletions b/‎src/sagemaker/amtviz/__init__.py‎
Lines changed: 27 additions & 0 deletions
diff --git a/‎src/sagemaker/amtviz/job_metrics.py‎
Lines changed: 180 additions & 0 deletions b/‎src/sagemaker/amtviz/job_metrics.py‎
Lines changed: 180 additions & 0 deletions
@@ -37,4 +37,5 @@ src/sagemaker/modules/train/container_drivers/sourcecode.json
 src/sagemaker/modules/train/container_drivers/distributed.json
 tests/data/**/_repack_model.py
 tests/data/experiment/sagemaker-dev-1.0.tar.gz
-src/sagemaker/serve/tmp_workspace
+src/sagemaker/serve/tmp_workspace
+test-examples
@@ -1,5 +1,59 @@
 # Changelog
 
+## v2.249.0 (2025-07-31)
+
+### Features
+
+ * AWS Batch for SageMaker Training jobs
+
+### Bug Fixes and Other Changes
+
+ * Directly use customer-provided endpoint name for ModelBuilder deployment.
+ * update image_uri_configs 07-23-2025 07:18:25 PST
+
+## v2.248.2 (2025-07-22)
+
+### Bug Fixes and Other Changes
+
+ * Relax boto3 version requirement
+ * update image_uri_configs 07-22-2025 07:18:25 PST
+ * update image_uri_configs 07-18-2025 07:18:28 PST
+ * add hard dependency on sagemaker-core pypi lib
+ * When rootlessDocker is enabled, return a fixed SageMaker IP
+
+## v2.248.1 (2025-07-16)
+
+### Bug Fixes and Other Changes
+
+ * Nova training support
+
+## v2.248.0 (2025-07-15)
+
+### Features
+
+ * integrate amtviz for visualization of tuning jobs
+
+### Bug Fixes and Other Changes
+
+ * build(deps): bump requests in /tests/data/serve_resources/mlflow/pytorch
+ * build(deps): bump protobuf from 4.25.5 to 4.25.8 in /requirements/extras
+ * build(deps): bump mlflow in /tests/data/serve_resources/mlflow/xgboost
+ * build(deps): bump torch in /tests/data/modules/script_mode
+ * sanitize git clone repo input url
+ * Adding Hyperpod feature to enable hyperpod telemetry
+ * Adding Hyperpod feature to enable hyperpod telemetry
+ * Bump SMD version to enable custom workflow deployment.
+ * Update TF DLC python version to py312
+ * update image_uri_configs 07-04-2025 07:18:27 PST
+ * update image_uri_configs 06-26-2025 07:18:35 PST
+ * relax protobuf to <6.32
+
+## v2.247.1 (2025-06-23)
+
+### Bug Fixes and Other Changes
+
+ * update image_uri_configs 06-19-2025 07:18:34 PST
+
 ## v2.247.0 (2025-06-13)
 
 ### Features
 
@@ -1 +1 @@
-2.247.1.dev0
+2.249.1.dev0
@@ -32,7 +32,7 @@ classifiers = [
 ]
 dependencies = [
   "attrs>=24,<26",
-  "boto3>=1.35.75,<2.0",
+  "boto3>=1.35.36,<2.0",
   "cloudpickle>=2.2.1",
   "docker",
   "fastapi",
@@ -45,7 +45,7 @@ dependencies = [
   "pandas",
   "pathos",
   "platformdirs",
-  "protobuf>=3.12,<6.0",
+  "protobuf>=3.12,<6.32",
   "psutil",
   "PyYAML>=6.0.1",
   "requests",
 
@@ -32,7 +32,7 @@ PyYAML>=6.0.1
 xgboost>=1.6.2,<=1.7.6
 pillow>=10.0.1,<=11
 opentelemetry-proto==1.27.0
-protobuf==4.25.5
+protobuf==4.25.8
 tensorboard>=2.16.2,<=2.18.0
 transformers==4.48.0
 sentencepiece==0.1.99
 
@@ -0,0 +1,27 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+"""Amazon SageMaker Automatic Model Tuning Visualization module.
+
+This module provides visualization capabilities for SageMaker hyperparameter tuning jobs.
+It enables users to create interactive visualizations to analyze and understand the
+performance of hyperparameter optimization experiments.
+
+Example:
+    >>> from sagemaker.amtviz import visualize_tuning_job
+    >>> visualize_tuning_job('my-tuning-job')
+"""
+from __future__ import absolute_import
+
+from sagemaker.amtviz.visualization import visualize_tuning_job
+
+# Names exported by `from sagemaker.amtviz import *`; only the function
+# imported above is listed.
+__all__ = ["visualize_tuning_job"]
@@ -0,0 +1,180 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+"""Helper functions to retrieve job metrics from CloudWatch."""
+from __future__ import absolute_import
+
+from datetime import datetime, timedelta
+from typing import Callable, List, Optional, Tuple, Dict, Any
+import hashlib
+import os
+from pathlib import Path
+
+import logging
+import pandas as pd
+import numpy as np
+import boto3
+
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+
+# NOTE(review): both clients are created when this module is imported, via
+# boto3's default session. Presumably this means importing the module needs a
+# resolvable AWS region - confirm.
+# `cw` is used by the CloudWatch helpers in this module; `sm` is not
+# referenced in the visible part of this module.
+cw = boto3.client("cloudwatch")
+sm = boto3.client("sagemaker")
+
+
+def disk_cache(outer: Callable) -> Callable:
+    """Wrap ``outer`` so that the DataFrame it returns is cached on disk.
+
+    The cache key is the MD5 hex digest of ``str(args) + str(kwargs)``. Each entry is
+    stored as ``req_<key>.jsonl.gz`` (JSON Lines, one record per row) under
+    ``.amtviz/cw_metrics_cache`` in the user's home directory. Entries are never
+    expired or invalidated by this decorator.
+
+    On a cache hit the ``ts`` and ``rel_ts`` columns are converted back to
+    timezone-naive datetimes. A cache file that cannot be loaded is treated as a
+    miss: ``outer`` is called again and the file is overwritten.
+
+    Args:
+        outer (Callable): The function to be wrapped. Must return a pandas DataFrame
+            containing CloudWatch metrics data.
+
+    Returns:
+        Callable: A wrapper with the same call signature as ``outer``.
+    """
+    # Imported here so the decorator does not rely on a module-level import.
+    from functools import wraps
+
+    @wraps(outer)
+    def inner(*args: Any, **kwargs: Any) -> pd.DataFrame:
+        key_input = str(args) + str(kwargs)
+        # nosec b303 - Not used for cryptography, but to create lookup key
+        key = hashlib.md5(key_input.encode("utf-8")).hexdigest()
+        cache_dir = Path.home().joinpath(".amtviz/cw_metrics_cache")
+        fn = f"{cache_dir}/req_{key}.jsonl.gz"
+        if Path(fn).exists():
+            try:
+                df = pd.read_json(fn, lines=True)
+                # "H" marks a cache hit. Logger.debug() accepts no print-style
+                # `end` keyword; passing one raises TypeError once DEBUG is enabled.
+                logger.debug("H")
+                for column in ("ts", "rel_ts"):
+                    # pyright: ignore [reportIndexIssue, reportOptionalSubscript]
+                    df[column] = pd.to_datetime(df[column]).dt.tz_localize(None)
+                return df
+            except KeyError:
+                # Empty file leads to empty df, hence no df['ts'] possible
+                pass
+            # nosec b110 - doesn't matter why we could not load it.
+            # Exception (not BaseException) so that KeyboardInterrupt and
+            # SystemExit still propagate to the caller.
+            except Exception as e:  # pylint: disable=broad-except
+                logger.error("\nException: %s - %s", type(e), e)
+
+        # "M" marks a cache miss (or an unreadable cache file).
+        logger.debug("M")
+        df = outer(*args, **kwargs)
+        assert isinstance(df, pd.DataFrame), "Only caching Pandas DataFrames."
+
+        os.makedirs(cache_dir, exist_ok=True)
+        df.to_json(fn, orient="records", date_format="iso", lines=True)
+
+        return df
+
+    return inner
+
+
+def _metric_data_query_tpl(metric_name: str, dim_name: str, dim_value: str) -> Dict[str, Any]:
+    """Build one CloudWatch ``MetricDataQueries`` entry for a training-job metric.
+
+    The query asks for the 60-second ``Average`` of ``metric_name`` in the
+    ``/aws/sagemaker/TrainingJobs`` namespace, filtered by a single dimension.
+
+    The query ``Id`` is derived from the metric name: it is lower-cased, every
+    character other than ``a-z``, ``0-9`` and ``_`` is replaced by ``_``, and ``m_``
+    is prepended when the result does not start with a lowercase letter.
+    CloudWatch rejects ids that break these rules, and metric names are free-form
+    (for example ``eval/loss``).
+
+    Args:
+        metric_name (str): CloudWatch metric name; sent unchanged as ``MetricName``.
+        dim_name (str): Name of the dimension to filter on.
+        dim_value (str): Value of the dimension to filter on.
+
+    Returns:
+        Dict[str, Any]: A query dict with ``ReturnData`` set to ``True``.
+    """
+    allowed = "abcdefghijklmnopqrstuvwxyz0123456789_"
+    query_id = "".join(ch if ch in allowed else "_" for ch in metric_name.lower())
+    if not query_id or not "a" <= query_id[0] <= "z":
+        # Ids must start with a lowercase letter.
+        query_id = "m_" + query_id
+
+    return {
+        "Id": query_id,
+        "MetricStat": {
+            "Stat": "Average",
+            "Metric": {
+                "Namespace": "/aws/sagemaker/TrainingJobs",
+                "MetricName": metric_name,
+                "Dimensions": [
+                    {"Name": dim_name, "Value": dim_value},
+                ],
+            },
+            "Period": 60,
+        },
+        "ReturnData": True,
+    }
+
+
+def _get_metric_data(
+    queries: List[Dict[str, Any]], start_time: datetime, end_time: datetime
+) -> pd.DataFrame:
+    """Fetch CloudWatch metric data for ``queries`` and return it as one DataFrame.
+
+    The requested window is widened by one hour on each side before CloudWatch is
+    called. All result pages are read by following ``NextToken``, so large result
+    sets are not cut off after the first page.
+
+    Args:
+        queries (List[Dict[str, Any]]): ``MetricDataQueries`` entries to send.
+        start_time (datetime): Start of the window of interest.
+        end_time (datetime): End of the window of interest.
+
+    Returns:
+        pd.DataFrame: One row per data point with the columns ``value``, ``ts`` and
+            ``label``. When at least one data point exists there is also ``rel_ts``,
+            the offset of ``ts`` from the earliest observed ``ts``, anchored at
+            ``datetime.fromtimestamp(1)``. A DataFrame without columns is returned
+            when CloudWatch reports no results at all.
+    """
+    request: Dict[str, Any] = {
+        "MetricDataQueries": queries,
+        "StartTime": start_time - timedelta(hours=1),
+        "EndTime": end_time + timedelta(hours=1),
+    }
+
+    frames = []
+    while True:
+        response = cw.get_metric_data(**request)
+        for metric_data in response.get("MetricDataResults", []):
+            values = metric_data["Values"]
+            frames.append(
+                pd.DataFrame(
+                    {
+                        "value": values,
+                        "ts": np.array(metric_data["Timestamps"], dtype=np.datetime64),
+                        "label": [metric_data["Label"]] * len(values),
+                    }
+                )
+            )
+        next_token = response.get("NextToken")
+        if not next_token:
+            break
+        request["NextToken"] = next_token
+
+    if not frames:
+        return pd.DataFrame()
+
+    # Concatenate once instead of growing a DataFrame inside the loop.
+    df = pd.concat(frames)
+
+    # We now calculate the relative time based on the first actual observed
+    # time stamps, not the potentially start time that we used to scope our CW
+    # API call. The difference could be for example startup times or waiting
+    # for Spot.
+    if not df.empty:
+        df["rel_ts"] = datetime.fromtimestamp(1) + (df["ts"] - df["ts"].min())  # pyright: ignore
+    return df
+
+
+@disk_cache
+def _collect_metrics(
+    dimensions: List[Tuple[str, str]], start_time: datetime, end_time: Optional[datetime]
+) -> pd.DataFrame:
+    """Gather CloudWatch data for every metric found under each given dimension.
+
+    For each ``(name, value)`` pair the metrics in the
+    ``/aws/sagemaker/TrainingJobs`` namespace are listed, one data query is built
+    per metric name, and the fetched rows are appended to the result. Dimensions
+    without any listed metric are skipped.
+
+    Args:
+        dimensions (List[Tuple[str, str]]): Dimension ``(name, value)`` pairs to query.
+        start_time (datetime): Start of the time range, forwarded to the data fetch.
+        end_time (datetime, optional): End of the time range, forwarded unchanged.
+
+    Returns:
+        pd.DataFrame: Rows of all dimensions concatenated; empty when nothing was found.
+    """
+    collected = pd.DataFrame()
+    for dim_name, dim_value in dimensions:
+        listing = cw.list_metrics(
+            Namespace="/aws/sagemaker/TrainingJobs",
+            Dimensions=[{"Name": dim_name, "Value": dim_value}],
+        )
+        names = [entry["MetricName"] for entry in listing["Metrics"] or []]
+        if not names:
+            # No metric data yet, or not any longer, because the data were aged out
+            continue
+
+        queries = [_metric_data_query_tpl(name, dim_name, dim_value) for name in names]
+        fetched = _get_metric_data(queries, start_time, end_time)
+        collected = pd.concat([collected, fetched])
+
+    return collected
+
+
+def get_cw_job_metrics(
+    job_name: str, start_time: Optional[datetime] = None, end_time: Optional[datetime] = None
+) -> pd.DataFrame:
+    """Retrieves CloudWatch metrics for a SageMaker training job.
+
+    Metrics are looked up for the ``TrainingJobName`` dimension and for the ``Host``
+    dimension of ``<job_name>/algo-1`` only.
+
+    Args:
+        job_name (str): Name of the SageMaker training job.
+        start_time (datetime, optional): Start time for metrics collection.
+            Defaults to now - 4 hours, taken from the UTC clock.
+        end_time (datetime, optional): End time for metrics collection.
+            Defaults to start_time + 4 hours.
+
+    Returns:
+        pd.DataFrame: Metrics data with columns for value, timestamp, and metric name.
+            Results are cached to disk for improved performance.
+    """
+    # Imported here so the function does not rely on a module-level import.
+    from datetime import timezone
+
+    dimensions = [
+        ("TrainingJobName", job_name),
+        ("Host", job_name + "/algo-1"),
+    ]
+    # If not given, use reasonable defaults for start and end time
+    if start_time is None:
+        # A timezone-aware UTC value is unambiguous for the CloudWatch request.
+        # NOTE(review): botocore presumably serializes naive datetimes as UTC, in
+        # which case a naive local "now" would shift the window - confirm.
+        start_time = datetime.now(timezone.utc) - timedelta(hours=4)
+    if end_time is None:
+        end_time = start_time + timedelta(hours=4)
+    return _collect_metrics(dimensions, start_time, end_time)