4242import sagemaker .utils
4343
4444CONTAINER_PREFIX = "algo"
45+ STUDIO_HOST_NAME = "sagemaker-local"
4546DOCKER_COMPOSE_FILENAME = "docker-compose.yaml"
4647DOCKER_COMPOSE_HTTP_TIMEOUT_ENV = "COMPOSE_HTTP_TIMEOUT"
4748DOCKER_COMPOSE_HTTP_TIMEOUT = "120"
5051REGION_ENV_NAME = "AWS_REGION"
5152TRAINING_JOB_NAME_ENV_NAME = "TRAINING_JOB_NAME"
5253S3_ENDPOINT_URL_ENV_NAME = "S3_ENDPOINT_URL"
54+ SM_STUDIO_LOCAL_MODE = "SM_STUDIO_LOCAL_MODE"
5355
5456# SELinux Enabled
5557SELINUX_ENABLED = os .environ .get ("SAGEMAKER_LOCAL_SELINUX_ENABLED" , "False" ).lower () in [
@@ -107,10 +109,30 @@ def __init__(
107109 # Since we are using a single docker network, generate a random suffix to attach to the
108110 # container names. This way multiple jobs can run in parallel.
109111 suffix = "" .join (random .choice (string .ascii_lowercase + string .digits ) for _ in range (5 ))
110- self .hosts = [
111- "{}-{}-{}" .format (CONTAINER_PREFIX , i , suffix )
112- for i in range (1 , self .instance_count + 1 )
113- ]
112+ self .is_studio = sagemaker .local .utils .check_for_studio ()
113+ if self .is_studio :
114+ if self .instance_count > 1 :
115+ raise NotImplementedError (
116+ "Multi instance Local Mode execution is "
117+ "currently not supported in SageMaker Studio."
118+ )
119+ # For the Studio use case, directories need to be created under `~/tmp` rather than `/tmp`
120+ home = os .path .expanduser ("~" )
121+ root_dir = os .path .join (home , "tmp" )
122+ if not os .path .isdir (root_dir ):
123+ os .mkdir (root_dir )
124+ if self .sagemaker_session .config :
125+ self .sagemaker_session .config ["local" ]["container_root" ] = root_dir
126+ else :
127+ self .sagemaker_session .config = {"local" : {"container_root" : root_dir }}
128+ # Studio only supports single instance run
129+ self .hosts = [STUDIO_HOST_NAME ]
130+ else :
131+ self .hosts = [
132+ "{}-{}-{}" .format (CONTAINER_PREFIX , i , suffix )
133+ for i in range (1 , self .instance_count + 1 )
134+ ]
135+
114136 self .container_root = None
115137 self .container = None
116138
@@ -201,22 +223,17 @@ def process(
201223 self ._generate_compose_file (
202224 "process" , additional_volumes = volumes , additional_env_vars = environment
203225 )
204- compose_command = self ._compose ()
205226
206227 if _ecr_login_if_needed (self .sagemaker_session .boto_session , self .image ):
207228 _pull_image (self .image )
208229
230+ compose_command = self ._compose ()
209231 process = subprocess .Popen (
210232 compose_command , stdout = subprocess .PIPE , stderr = subprocess .STDOUT
211233 )
212234
213235 try :
214236 _stream_output (process )
215- except RuntimeError as e :
216- # _stream_output() doesn't have the command line. We will handle the exception
217- # which contains the exit code and append the command line to it.
218- msg = f"Failed to run: { compose_command } "
219- raise RuntimeError (msg ) from e
220237 finally :
221238 # Uploading processing outputs back to Amazon S3.
222239 self ._upload_processing_outputs (data_dir , processing_output_config )
@@ -283,22 +300,17 @@ def train(self, input_data_config, output_data_config, hyperparameters, environm
283300 compose_data = self ._generate_compose_file (
284301 "train" , additional_volumes = volumes , additional_env_vars = training_env_vars
285302 )
286- compose_command = self ._compose ()
287303
288304 if _ecr_login_if_needed (self .sagemaker_session .boto_session , self .image ):
289305 _pull_image (self .image )
290306
307+ compose_command = self ._compose ()
291308 process = subprocess .Popen (
292309 compose_command , stdout = subprocess .PIPE , stderr = subprocess .STDOUT
293310 )
294311
295312 try :
296313 _stream_output (process )
297- except RuntimeError as e :
298- # _stream_output() doesn't have the command line. We will handle the exception
299- # which contains the exit code and append the command line to it.
300- msg = "Failed to run: %s, %s" % (compose_command , str (e ))
301- raise RuntimeError (msg )
302314 finally :
303315 artifacts = self .retrieve_artifacts (compose_data , output_data_config , job_name )
304316
@@ -347,6 +359,7 @@ def serve(self, model_dir, environment):
347359 self ._generate_compose_file (
348360 "serve" , additional_env_vars = environment , additional_volumes = volumes
349361 )
362+
350363 compose_command = self ._compose ()
351364
352365 self .container = _HostingContainer (compose_command )
@@ -710,6 +723,9 @@ def _generate_compose_file(self, command, additional_volumes=None, additional_en
710723 additional_env_var_list = ["{}={}" .format (k , v ) for k , v in additional_env_vars .items ()]
711724 environment .extend (additional_env_var_list )
712725
726+ if self .is_studio :
727+ environment .extend ([f"{ SM_STUDIO_LOCAL_MODE } =True" ])
728+
713729 if os .environ .get (DOCKER_COMPOSE_HTTP_TIMEOUT_ENV ) is None :
714730 os .environ [DOCKER_COMPOSE_HTTP_TIMEOUT_ENV ] = DOCKER_COMPOSE_HTTP_TIMEOUT
715731
@@ -723,12 +739,19 @@ def _generate_compose_file(self, command, additional_volumes=None, additional_en
723739 for h in self .hosts
724740 }
725741
726- content = {
727- # Use version 2.3 as a minimum so that we can specify the runtime
728- "version" : "2.3" ,
729- "services" : services ,
730- "networks" : {"sagemaker-local" : {"name" : "sagemaker-local" }},
731- }
742+ if self .is_studio :
743+ content = {
744+ # Use version 2.3 as a minimum so that we can specify the runtime
745+ "version" : "2.3" ,
746+ "services" : services ,
747+ }
748+ else :
749+ content = {
750+ # Use version 2.3 as a minimum so that we can specify the runtime
751+ "version" : "2.3" ,
752+ "services" : services ,
753+ "networks" : {"sagemaker-local" : {"name" : "sagemaker-local" }},
754+ }
732755
733756 docker_compose_path = os .path .join (self .container_root , DOCKER_COMPOSE_FILENAME )
734757
@@ -810,7 +833,6 @@ def _create_docker_host(
810833 "tty" : True ,
811834 "volumes" : [v .map for v in optml_volumes ],
812835 "environment" : environment ,
813- "networks" : {"sagemaker-local" : {"aliases" : [host ]}},
814836 }
815837
816838 is_train_with_entrypoint = False
@@ -827,14 +849,19 @@ def _create_docker_host(
827849 if self .container_arguments :
828850 host_config ["entrypoint" ] = host_config ["entrypoint" ] + self .container_arguments
829851
852+ if self .is_studio :
853+ host_config ["network_mode" ] = "sagemaker"
854+ else :
855+ host_config ["networks" ] = {"sagemaker-local" : {"aliases" : [host ]}}
856+
830857 # for GPU support pass in nvidia as the runtime, this is equivalent
831858 # to setting --runtime=nvidia in the docker commandline.
832859 if self .instance_type == "local_gpu" :
833860 host_config ["deploy" ] = {
834861 "resources" : {"reservations" : {"devices" : [{"capabilities" : ["gpu" ]}]}}
835862 }
836863
837- if command == "serve" :
864+ if not self . is_studio and command == "serve" :
838865 serving_port = (
839866 sagemaker .utils .get_config_value (
840867 "local.serving_port" , self .sagemaker_session .config
@@ -910,7 +937,7 @@ def __init__(self, command):
910937 """Creates a new threaded hosting container.
911938
912939 Args:
913- command:
940+ command: docker compose command line as returned by ``_compose()`` (a ``subprocess.Popen``-style argument sequence, not a dict)
914941 """
915942 Thread .__init__ (self )
916943 self .command = command
@@ -987,8 +1014,8 @@ def _stream_output(process):
9871014 sys .stdout .write (stdout )
9881015 exit_code = process .poll ()
9891016
990- if exit_code != 0 :
991- raise RuntimeError (" Process exited with code: %s" % exit_code )
1017+ if exit_code not in [ 0 , 130 ] :
1018+ raise RuntimeError (f"Failed to run: { process . args } . Process exited with code: { exit_code } " )
9921019
9931020 return exit_code
9941021
0 commit comments