Merge remote-tracking branch 'upstream/main'

dchourasia · dchourasia · commit 5fad1974c540 · 2024-10-23T00:18:03.000Z
diff --git a/README.md b/README.md
@@ -3,6 +3,8 @@
 ## Examples
 
 * Fine-Tune LLMs with Ray and DeepSpeed on OpenShift AI
+* Fine-Tune Stable Diffusion with DreamBooth and Ray Train
+* Hyperparameter Optimization with Ray Tune on OpenShift AI
 
 ## Integration Tests
 
diff --git a/instructlab/standalone/README.md b/instructlab/standalone/README.md
@@ -9,6 +9,144 @@ of models without relying on centralized orchestration tools like KubeFlow.
 The `standalone.py` tool provides support for fetching generated SDG (Synthetic Data Generation) data from an AWS S3 compatible object store.
 While AWS S3 is supported, alternative object storage solutions such as Ceph, Nooba, and MinIO are also compatible.
 
+## Overall end-to-end workflow
+
+```text
++-------------------------------+
+|       Kubernetes Job          |
+|         "data-download"       |
++-------------------------------+
+|      Init Container           |
+| "download-data-object-store"  |
+|  (Fetches data from object    |
+|        storage)               |
++-------------------------------+
+|        Main Container         |
+|  "sdg-data-preprocess"        |
+| (Processes the downloaded     |
+|         data)                 |
++-------------------------------+
+              |
+              v
++-------------------------------+
+|   "wait for completion"       |
++-------------------------------+
+              |
+              v
++-----------------------------------+
+|   PytorchJob CR training phase 1  |
+|                                   |
+|       +---------------------+     |
+|       |    Master Pod       |     |
+|       | (Trains and         |     |
+|       |  Coordinates the    |     |
+|       |   distributed       |     |
+|       |   training)         |     |
+|       +---------------------+     |
+|                |                  |
+|                v                  |
+|       +---------------------+     |
+|       |    Worker Pod 1     |     |
+|       |  (Handles part of   |     |
+|       |   the training)     |     |
+|       +---------------------+     |
+|                |                  |
+|                v                  |
+|       +---------------------+     |
+|       |    Worker Pod 2     |     |
+|       |  (Handles part of   |     |
+|       |   the training)     |     |
+|       +---------------------+     |
++-----------------------------------+
+              |
+              v
++-------------------------------+
+|   "wait for completion"       |
++-------------------------------+
+              |
+              v
++-----------------------------------+
+|   PytorchJob CR training phase 2  |
+|                                   |
+|       +---------------------+     |
+|       |    Master Pod       |     |
+|       | (Trains and         |     |
+|       |  Coordinates the    |     |
+|       |   distributed       |     |
+|       |   training)         |     |
+|       +---------------------+     |
+|                |                  |
+|                v                  |
+|       +---------------------+     |
+|       |    Worker Pod 1     |     |
+|       |  (Handles part of   |     |
+|       |   the training)     |     |
+|       +---------------------+     |
+|                |                  |
+|                v                  |
+|       +---------------------+     |
+|       |    Worker Pod 2     |     |
+|       |  (Handles part of   |     |
+|       |   the training)     |     |
+|       +---------------------+     |
++-----------------------------------+
+              |
+              v
++-------------------------------+
+|   "wait for completion"       |
++-------------------------------+
+              |
+              v
++-------------------------------+
+|       Kubernetes Job          |
+|         "eval-mt-bench"       |
++-------------------------------+
+|      Init Container           |
+|     "run-eval-mt-bench"       |
+|  (Runs evaluation on MT Bench)|
++-------------------------------+
+|        Main Container         |
+|  "output-eval-mt-bench-scores"|
+| (Outputs evaluation scores)   |
++-------------------------------+
+              |
+              v
++-------------------------------+
+|   "wait for completion"       |
++-------------------------------+
+              |
+              v
++-------------------------------+
+|       Kubernetes Job          |
+|          "eval-final"         |
++-------------------------------+
+|      Init Container           |
+|       "run-eval-final"        |
+|  (Runs final evaluation)      |
++-------------------------------+
+|        Main Container         |
+|  "output-eval-final-scores"   |
+|  (Outputs final evaluation    |
+|          scores)              |
++-------------------------------+
+              |
+              v
++-------------------------------+
+|   "wait for completion"       |
++-------------------------------+
+              |
+              v
++-------------------------------+
+|       Kubernetes Job          |
+|      "trained-model-upload"   |
++-------------------------------+
+|        Main Container         |
+|  "upload-data-object-store"   |
+|  (Uploads the trained model to|
+|     the object storage)       |
++-------------------------------+
+```
+
 ## Requirements
 
 The `standalone.py` script is designed to run within a Kubernetes environment. The following requirements must be met:
diff --git a/instructlab/standalone/standalone.py b/instructlab/standalone/standalone.py
@@ -1550,6 +1550,7 @@ def data_processing(train_args: TrainingArgs) -> None:
 def create_eval_job(
     namespace: str,
     eval_type: str,
+    judge_serving_model_secret: str,
     nproc_per_node: int = 1,
 ) -> kubernetes.client.V1Job:
     """
@@ -1560,6 +1561,7 @@ def create_eval_job(
     Args:
         namespace (str): The namespace in which the job will be created.
         eval_type (str): The type of evaluation to run.
+        judge_serving_model_secret (str): The name of the Kubernetes Secret containing the judge serving model details.
         nproc_per_node (int): The number of processes per node.
 
     Returns:
@@ -1729,7 +1731,7 @@ def shutdown_vllm(process: subprocess.Popen, timeout: int = 20):
         max_workers = usable_cpu_count
 
     # modify model_list to ignore any jsonl files present in the directory
-    models_list = [model for model in models_list if model.endswith(".jsonl") != True]
+    models_list = [model for model in models_list if not model.endswith(".jsonl")]
     for model_name in models_list:
         print(f"Serving candidate model: {model_name}")
         model_path = f"{models_path_prefix}/{model_name}"
@@ -2275,7 +2277,7 @@ def find_node_dataset_directories(base_dir: str):
                 env_from=[
                     kubernetes.client.V1EnvFromSource(
                         secret_ref=kubernetes.client.V1SecretEnvSource(
-                            name=JUDGE_SERVING_NAME
+                            name=judge_serving_model_secret
                         )
                     ),
                 ],
@@ -2310,7 +2312,7 @@ def find_node_dataset_directories(base_dir: str):
                 env_from=[
                     kubernetes.client.V1EnvFromSource(
                         secret_ref=kubernetes.client.V1SecretEnvSource(
-                            name=JUDGE_SERVING_NAME
+                            name=judge_serving_model_secret
                         )
                     ),
                 ],
@@ -2854,6 +2856,9 @@ def decode_base64(data):
                         f"Secret {judge_serving_model_secret} not found in namespace {namespace}."
                     ) from exc
 
+    # Set the judge secret in the context for the evaluation job
+    ctx.obj["judge_serving_model_secret"] = judge_serving_model_secret
+
     # list of PVCs to create and their details
     pvcs = [
         {
@@ -3112,6 +3117,13 @@ def evaluation(ctx: click.Context) -> str:
     namespace = ctx.obj["namespace"]
     eval_type = ctx.obj["eval_type"]
     dry_run = ctx.obj["dry_run"]
+    judge_serving_model_secret = ctx.obj["judge_serving_model_secret"]
+
+    # This should only happen if the script is called with the "evaluation" subcommand
+    if not judge_serving_model_secret:
+        raise ValueError(
+            "Judge serving model secret must be provided with --judge-serving-model-secret."
+        )
 
     if eval_type is None:
         raise ValueError(
@@ -3121,7 +3133,11 @@ def evaluation(ctx: click.Context) -> str:
     logger.info("Running %s evaluation.", eval_type)
 
     # Create and run the evaluation job
-    job = create_eval_job(namespace=namespace, eval_type=eval_type)
+    job = create_eval_job(
+        namespace=namespace,
+        eval_type=eval_type,
+        judge_serving_model_secret=judge_serving_model_secret,
+    )
 
     if dry_run:
         logger.info("Dry run: Job would be created.\n%s", job)