Skip to content
6 changes: 5 additions & 1 deletion src/sagemaker/modules/configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

from __future__ import absolute_import

from typing import Optional, Union
from typing import Optional, Union, List
from pydantic import BaseModel, model_validator, ConfigDict

import sagemaker_core.shapes as shapes
Expand Down Expand Up @@ -96,12 +96,16 @@ class SourceCode(BaseConfig):
command (Optional[str]):
The command(s) to execute in the training job container. Example: "python my_script.py".
If not specified, entry_script must be provided.
ignore_patterns (Optional[List[str]]):
The patterns used to exclude specific files/folders when uploading to S3. Example:
['.env', '.git', 'data', '__pycache__'].
"""

source_dir: Optional[str] = None
requirements: Optional[str] = None
entry_script: Optional[str] = None
command: Optional[str] = None
ignore_patterns: Optional[List[str]] = None


class Compute(shapes.ResourceConfig):
Expand Down
37 changes: 30 additions & 7 deletions src/sagemaker/modules/train/model_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,8 @@ class ModelTrainer(BaseModel):
from sagemaker.modules.train import ModelTrainer
from sagemaker.modules.configs import SourceCode, Compute, InputData

source_code = SourceCode(source_dir="source", entry_script="train.py")
ignore_patterns = ['.env', '.git', 'data', '__pycache__']
source_code = SourceCode(source_dir="source", entry_script="train.py", ignore_patterns=ignore_patterns)
training_image = "123456789012.dkr.ecr.us-west-2.amazonaws.com/my-training-image"
model_trainer = ModelTrainer(
training_image=training_image,
Expand Down Expand Up @@ -654,6 +655,7 @@ def train(
channel_name=SM_CODE,
data_source=self.source_code.source_dir,
key_prefix=input_data_key_prefix,
ignore_patterns=self.source_code.ignore_patterns,
)
final_input_data_config.append(source_code_channel)

Expand Down Expand Up @@ -755,7 +757,11 @@ def train(
local_container.train(wait)

def create_input_data_channel(
self, channel_name: str, data_source: DataSourceType, key_prefix: Optional[str] = None
self,
channel_name: str,
data_source: DataSourceType,
key_prefix: Optional[str] = None,
ignore_patterns: Optional[List[str]] = None,
) -> Channel:
"""Create an input data channel for the training job.

Expand All @@ -771,6 +777,9 @@ def create_input_data_channel(

If specified, local data will be uploaded to:
``s3://<default_bucket_path>/<key_prefix>/<channel_name>/``
ignore_patterns (Optional[List[str]]):
The glob-style patterns used to exclude specific files/folders when uploading to S3.
Example: ['.env', '.git', 'data', '__pycache__'].
"""
channel = None
if isinstance(data_source, str):
Expand Down Expand Up @@ -810,11 +819,25 @@ def create_input_data_channel(
)
if self.sagemaker_session.default_bucket_prefix:
key_prefix = f"{self.sagemaker_session.default_bucket_prefix}/{key_prefix}"
s3_uri = self.sagemaker_session.upload_data(
path=data_source,
bucket=self.sagemaker_session.default_bucket(),
key_prefix=key_prefix,
)
if ignore_patterns:
tmp_dir = TemporaryDirectory()
shutil.copytree(
data_source,
os.path.join(tmp_dir.name, os.path.basename(data_source)),
dirs_exist_ok=True,
ignore=shutil.ignore_patterns(*ignore_patterns),
)
s3_uri = self.sagemaker_session.upload_data(
path=tmp_dir.name,
bucket=self.sagemaker_session.default_bucket(),
key_prefix=key_prefix,
)
else:
s3_uri = self.sagemaker_session.upload_data(
path=data_source,
bucket=self.sagemaker_session.default_bucket(),
key_prefix=key_prefix,
)
channel = Channel(
channel_name=channel_name,
data_source=DataSource(
Expand Down