Skip to content

Commit 5fad197

Browse files
committed
Merge remote-tracking branch 'upstream/main'
2 parents 1931e3b + 81b4bfe commit 5fad197

File tree

3 files changed

+160
-4
lines changed

3 files changed

+160
-4
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
## Examples
44

55
* Fine-Tune LLMs with Ray and DeepSpeed on OpenShift AI
6+
* Fine-Tune Stable Diffusion with DreamBooth and Ray Train
7+
* Hyperparameter Optimization with Ray Tune on OpenShift AI
68

79
## Integration Tests
810

instructlab/standalone/README.md

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,144 @@ of models without relying on centralized orchestration tools like KubeFlow.
99
The `standalone.py` tool provides support for fetching generated SDG (Synthetic Data Generation) data from an AWS S3 compatible object store.
1010
While AWS S3 is supported, alternative object storage solutions such as Ceph, NooBaa, and MinIO are also compatible.
1111

12+
## Overall end-to-end workflow
13+
14+
```text
15+
+-------------------------------+
16+
| Kubernetes Job |
17+
| "data-download" |
18+
+-------------------------------+
19+
| Init Container |
20+
| "download-data-object-store" |
21+
| (Fetches data from object |
22+
| storage) |
23+
+-------------------------------+
24+
| Main Container |
25+
| "sdg-data-preprocess" |
26+
| (Processes the downloaded |
27+
| data) |
28+
+-------------------------------+
29+
|
30+
v
31+
+-------------------------------+
32+
| "watch for completion" |
33+
+-------------------------------+
34+
|
35+
v
36+
+-----------------------------------+
37+
| PyTorchJob CR training phase 1 |
38+
| |
39+
| +---------------------+ |
40+
| | Master Pod | |
41+
| | (Trains and | |
42+
| | Coordinates the | |
43+
| | distributed | |
44+
| | training) | |
45+
| +---------------------+ |
46+
| | |
47+
| v |
48+
| +---------------------+ |
49+
| | Worker Pod 1 | |
50+
| | (Handles part of | |
51+
| | the training) | |
52+
| +---------------------+ |
53+
| | |
54+
| v |
55+
| +---------------------+ |
56+
| | Worker Pod 2 | |
57+
| | (Handles part of | |
58+
| | the training) | |
59+
| +---------------------+ |
60+
+-----------------------------------+
61+
|
62+
v
63+
+-------------------------------+
64+
| "wait for completion" |
65+
+-------------------------------+
66+
|
67+
v
68+
+-----------------------------------+
69+
| PyTorchJob CR training phase 2 |
70+
| |
71+
| +---------------------+ |
72+
| | Master Pod | |
73+
| | (Trains and | |
74+
| | Coordinates the | |
75+
| | distributed | |
76+
| | training) | |
77+
| +---------------------+ |
78+
| | |
79+
| v |
80+
| +---------------------+ |
81+
| | Worker Pod 1 | |
82+
| | (Handles part of | |
83+
| | the training) | |
84+
| +---------------------+ |
85+
| | |
86+
| v |
87+
| +---------------------+ |
88+
| | Worker Pod 2 | |
89+
| | (Handles part of | |
90+
| | the training) | |
91+
| +---------------------+ |
92+
+-----------------------------------+
93+
|
94+
v
95+
+-------------------------------+
96+
| "wait for completion" |
97+
+-------------------------------+
98+
|
99+
v
100+
+-------------------------------+
101+
| Kubernetes Job |
102+
| "eval-mt-bench" |
103+
+-------------------------------+
104+
| Init Container |
105+
| "run-eval-mt-bench" |
106+
| (Runs evaluation on MT Bench)|
107+
+-------------------------------+
108+
| Main Container |
109+
| "output-eval-mt-bench-scores"|
110+
| (Outputs evaluation scores) |
111+
+-------------------------------+
112+
|
113+
v
114+
+-------------------------------+
115+
| "wait for completion" |
116+
+-------------------------------+
117+
|
118+
v
119+
+-------------------------------+
120+
| Kubernetes Job |
121+
| "eval-final" |
122+
+-------------------------------+
123+
| Init Container |
124+
| "run-eval-final" |
125+
| (Runs final evaluation) |
126+
+-------------------------------+
127+
| Main Container |
128+
| "output-eval-final-scores" |
129+
| (Outputs final evaluation |
130+
| scores) |
131+
+-------------------------------+
132+
|
133+
v
134+
+-------------------------------+
135+
| "wait for completion" |
136+
+-------------------------------+
137+
|
138+
v
139+
+-------------------------------+
140+
| Kubernetes Job |
141+
| "trained-model-upload" |
142+
+-------------------------------+
143+
| Main Container |
144+
| "upload-data-object-store" |
145+
| (Uploads the trained model to|
146+
| the object storage) |
147+
+-------------------------------+
148+
```
149+
12150
## Requirements
13151

14152
The `standalone.py` script is designed to run within a Kubernetes environment. The following requirements must be met:

instructlab/standalone/standalone.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1550,6 +1550,7 @@ def data_processing(train_args: TrainingArgs) -> None:
15501550
def create_eval_job(
15511551
namespace: str,
15521552
eval_type: str,
1553+
judge_serving_model_secret: str,
15531554
nproc_per_node: int = 1,
15541555
) -> kubernetes.client.V1Job:
15551556
"""
@@ -1560,6 +1561,7 @@ def create_eval_job(
15601561
Args:
15611562
namespace (str): The namespace in which the job will be created.
15621563
eval_type (str): The type of evaluation to run.
1564+
judge_serving_model_secret (str): The name of the Kubernetes Secret containing the judge serving model credentials.
15631565
nproc_per_node (int): The number of processes per node.
15641566
15651567
Returns:
@@ -1729,7 +1731,7 @@ def shutdown_vllm(process: subprocess.Popen, timeout: int = 20):
17291731
max_workers = usable_cpu_count
17301732
17311733
# modify model_list to ignore any jsonl files present in the directory
1732-
models_list = [model for model in models_list if model.endswith(".jsonl") != True]
1734+
models_list = [model for model in models_list if not model.endswith(".jsonl")]
17331735
for model_name in models_list:
17341736
print(f"Serving candidate model: {model_name}")
17351737
model_path = f"{models_path_prefix}/{model_name}"
@@ -2275,7 +2277,7 @@ def find_node_dataset_directories(base_dir: str):
22752277
env_from=[
22762278
kubernetes.client.V1EnvFromSource(
22772279
secret_ref=kubernetes.client.V1SecretEnvSource(
2278-
name=JUDGE_SERVING_NAME
2280+
name=judge_serving_model_secret
22792281
)
22802282
),
22812283
],
@@ -2310,7 +2312,7 @@ def find_node_dataset_directories(base_dir: str):
23102312
env_from=[
23112313
kubernetes.client.V1EnvFromSource(
23122314
secret_ref=kubernetes.client.V1SecretEnvSource(
2313-
name=JUDGE_SERVING_NAME
2315+
name=judge_serving_model_secret
23142316
)
23152317
),
23162318
],
@@ -2854,6 +2856,9 @@ def decode_base64(data):
28542856
f"Secret {judge_serving_model_secret} not found in namespace {namespace}."
28552857
) from exc
28562858

2859+
# Set the judge secret in the context for the evaluation job
2860+
ctx.obj["judge_serving_model_secret"] = judge_serving_model_secret
2861+
28572862
# list of PVCs to create and their details
28582863
pvcs = [
28592864
{
@@ -3112,6 +3117,13 @@ def evaluation(ctx: click.Context) -> str:
31123117
namespace = ctx.obj["namespace"]
31133118
eval_type = ctx.obj["eval_type"]
31143119
dry_run = ctx.obj["dry_run"]
3120+
judge_serving_model_secret = ctx.obj["judge_serving_model_secret"]
3121+
3122+
# This should only happen if the script is called with the "evaluation" subcommand
3123+
if not judge_serving_model_secret:
3124+
raise ValueError(
3125+
"Judge serving model secret must be provided with --judge-serving-model-secret."
3126+
)
31153127

31163128
if eval_type is None:
31173129
raise ValueError(
@@ -3121,7 +3133,11 @@ def evaluation(ctx: click.Context) -> str:
31213133
logger.info("Running %s evaluation.", eval_type)
31223134

31233135
# Create and run the evaluation job
3124-
job = create_eval_job(namespace=namespace, eval_type=eval_type)
3136+
job = create_eval_job(
3137+
namespace=namespace,
3138+
eval_type=eval_type,
3139+
judge_serving_model_secret=judge_serving_model_secret,
3140+
)
31253141

31263142
if dry_run:
31273143
logger.info("Dry run: Job would be created.\n%s", job)

0 commit comments

Comments
 (0)