Skip to content

Commit 8034a24

Browse files
rsareddy0329 and Roja Reddy Sareddy authored
Training: Main documentation update (#153)
* Training CLI & SDK: example notebook and README update
* Update training cli example notebook

---------

Co-authored-by: Roja Reddy Sareddy <[email protected]>
1 parent 9fbec4a commit 8034a24

File tree

2 files changed

+31
-10
lines changed

2 files changed

+31
-10
lines changed

README.md

Lines changed: 9 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -158,8 +158,8 @@ hyp create hyp-pytorch-job \
158158
--version 1.0 \
159159
--job-name test-pytorch-job \
160160
--image pytorch/pytorch:latest \
161-
--command '["python", "train.py"]' \
162-
--args '["--epochs", "10", "--batch-size", "32"]' \
161+
--command '[python, train.py]' \
162+
--args '[--epochs=10, --batch-size=32]' \
163163
--environment '{"PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:32"}' \
164164
--pull-policy "IfNotPresent" \
165165
--instance-type ml.p4d.24xlarge \
@@ -170,8 +170,8 @@ hyp create hyp-pytorch-job \
170170
--queue-name "training-queue" \
171171
--priority "high" \
172172
--max-retry 3 \
173-
--volumes '["data-vol", "model-vol", "checkpoint-vol"]' \
174-
--persistent-volume-claims '["shared-data-pvc", "model-registry-pvc"]' \
173+
--volumes '[data-vol, model-vol, checkpoint-vol]' \
174+
--persistent-volume-claims '[shared-data-pvc, model-registry-pvc]' \
175175
--output-s3-uri s3://my-bucket/model-artifacts
176176
```
177177
@@ -257,9 +257,10 @@ Along with the CLI, we also have SDKs available that can perform the training an
257257
258258
```
259259

260-
from sagemaker.hyperpod import HyperPodPytorchJob
261-
from sagemaker.hyperpod.job
262-
import ReplicaSpec, Template, Spec, Container, Resources, RunPolicy, Metadata
260+
from sagemaker.hyperpod.training import HyperPodPytorchJob
261+
from sagemaker.hyperpod.training
262+
import ReplicaSpec, Template, Spec, Containers, Resources, RunPolicy
263+
from sagemaker.hyperpod.common.config import Metadata
263264

264265
# Define job specifications
265266
nproc_per_node = "1" # Number of processes per node
@@ -274,7 +275,7 @@ replica_specs =
274275
(
275276
containers =
276277
[
277-
Container
278+
Containers
278279
(
279280
# Container name
280281
name="container-name",

examples/training/CLI/training-e2e-cli.ipynb

Lines changed: 22 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -17,12 +17,31 @@
1717
]
1818
},
1919
{
20+
"metadata": {},
21+
"cell_type": "code",
22+
"outputs": [],
23+
"execution_count": null,
24+
"source": "!hyp list-cluster --output table",
25+
"id": "9df747dbfa211453"
26+
},
27+
{
28+
"metadata": {},
2029
"cell_type": "code",
30+
"outputs": [],
2131
"execution_count": null,
22-
"id": "b30debba",
32+
"source": "!hyp set-cluster-context --cluster-name <cluster-name>",
33+
"id": "8db986d2b42a9e88"
34+
},
35+
{
2336
"metadata": {},
37+
"cell_type": "code",
2438
"outputs": [],
25-
"source": "!hyperpod get-clusters"
39+
"execution_count": null,
40+
"source": [
41+
"#verify the cluster context\n",
42+
"!hyp get-cluster-context "
43+
],
44+
"id": "ba996d7dc8e128d5"
2645
},
2746
{
2847
"metadata": {
@@ -46,6 +65,7 @@
4665
"metadata": {},
4766
"outputs": [],
4867
"source": [
68+
"#example command\n",
4969
"!hyp create hyp-pytorch-job \\\n",
5070
" --version 1.0 \\\n",
5171
" --job-name test-pytorch-job-cli \\\n",

0 commit comments

Comments
 (0)