@@ -18,13 +18,16 @@ Scheduling:
1818 Scheduler : slurm
1919 SlurmQueues :
2020 - Name : queue-trn32
21+ CapacityType : CAPACITY_BLOCK
2122 ComputeResources :
2223 - Name : compute-resource-trn32
23- Instances :
24- - InstanceType : {{instance}}
24+ InstanceType : {{instance}}
2525 MinCount : 2
26+ MaxCount : 2
2627 Efa :
2728 Enabled : true
29+ CapacityReservationTarget :
30+ CapacityReservationId : cr-05b0c099ce2534ce3
2831 Networking :
2932 SubnetIds :
3033 - {{ private_subnet_id }}
@@ -42,24 +45,24 @@ Scheduling:
4245 - BucketName : {{ bucket_name }}
4346 # Needed to download neuronx packages and neff file --> FIXME to be removed once packages are publicly available
4447 - BucketName : aws-parallelcluster-beta
45- - Name : queue-trn2
46- ComputeResources :
47- - Name : compute-resource-trn2
48- Instances :
49- - InstanceType : trn1.2xlarge
50- MinCount : 0 # TODO change to 1 once allreduce test is passing
51- Networking :
52- SubnetIds :
53- - {{ private_subnet_id }}
54- CustomActions :
55- OnNodeConfigured :
56- Script : s3://{{ bucket_name }}/neuron-installation.sh
57- Iam :
58- # Policy to access to Trainium beta repository info
59- AdditionalIamPolicies :
60- - Policy : arn:aws:iam::447714826191:policy/TrainiumPreviewPolicy
61- S3Access :
62- # Needed to download post install script
63- - BucketName : {{ bucket_name }}
64- # Needed to download neuronx packages and neff file --> FIXME to be removed once packages are public available
65- - BucketName : aws-parallelcluster-beta
48+ # - Name: queue-trn2
49+ # ComputeResources:
50+ # - Name: compute-resource-trn2
51+ # Instances:
52+ # - InstanceType: trn1.2xlarge
53+ # MinCount: 0 # TODO change to 1 once allreduce test is passing
54+ # Networking:
55+ # SubnetIds:
56+ # - {{ private_subnet_id }}
57+ # CustomActions:
58+ # OnNodeConfigured:
59+ # Script: s3://{{ bucket_name }}/neuron-installation.sh
60+ # Iam:
61+ # # Policy to access Trainium beta repository info
62+ # AdditionalIamPolicies:
63+ # - Policy: arn:aws:iam::447714826191:policy/TrainiumPreviewPolicy
64+ # S3Access:
65+ # # Needed to download post install script
66+ # - BucketName: {{ bucket_name }}
67+ # # Needed to download neuronx packages and neff file --> FIXME to be removed once packages are publicly available
68+ # - BucketName: aws-parallelcluster-beta
0 commit comments