Commit cf25842

Adding osft e2e test
1 parent 01c2d35 commit cf25842

4 files changed: +500 -0 lines changed

tests/trainer/kubeflow_sdk_test.go

Lines changed: 7 additions & 0 deletions
@@ -20,10 +20,17 @@ import (
 	"testing"
 
 	. "github.com/opendatahub-io/distributed-workloads/tests/common"
+	support "github.com/opendatahub-io/distributed-workloads/tests/common/support"
 	sdktests "github.com/opendatahub-io/distributed-workloads/tests/trainer/sdk_tests"
 )
 
 func TestKubeflowSdkSanity(t *testing.T) {
 	Tags(t, Sanity)
 	sdktests.RunFashionMnistCpuDistributedTraining(t)
 }
+
+// TestOsftTrainingHubMultiNodeMultiGPU tests OSFT training using TrainingHubTrainer
+func TestOsftTrainingHubMultiNodeMultiGPU(t *testing.T) {
+	Tags(t, KftoCuda, MultiNodeMultiGpu(2, support.NVIDIA, 1)) // TODO: may need to be updated once https://issues.redhat.com/browse/RHOAIENG-30719 and https://issues.redhat.com/browse/RHOAIENG-24552 are resolved
+	sdktests.RunOsftTrainingHubMultiGpuDistributedTraining(t)
+}
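
The body of sdktests.RunOsftTrainingHubMultiGpuDistributedTraining is not included in this diff; it presumably provisions a workbench and executes the notebook added below. As a rough, hypothetical sketch of that kind of harness, assuming papermill-style headless execution and the environment variables the notebook reads (none of which this commit confirms):

# Hypothetical harness sketch: run the test notebook headlessly with papermill.
# The notebook reads OPENSHIFT_API_URL, NOTEBOOK_USER_TOKEN and SHARED_PVC_NAME;
# how the real Go test injects these is not shown in this commit.
import os
import papermill as pm

os.environ.setdefault("SHARED_PVC_NAME", "shared")

pm.execute_notebook(
    "tests/trainer/resources/osft.ipynb",
    "/tmp/osft-output.ipynb",  # executed copy, useful when debugging failures
    kernel_name="python3",
)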

tests/trainer/resources/osft.ipynb

Lines changed: 286 additions & 0 deletions
New file contents:

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Standard library imports\n",
    "import logging\n",
    "import os\n",
    "import sys\n",
    "import time\n",
    "from io import StringIO"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from kubernetes import client as k8s, config as k8s_config\n",
    "\n",
    "# Edit to match your specific settings\n",
    "api_server = os.getenv(\"OPENSHIFT_API_URL\")\n",
    "token = os.getenv(\"NOTEBOOK_USER_TOKEN\")\n",
    "PVC_NAME = os.getenv(\"SHARED_PVC_NAME\", \"shared\")\n",
    "\n",
    "configuration = k8s.Configuration()\n",
    "configuration.host = api_server\n",
    "# Disable TLS verification when the cluster API server uses a self-signed\n",
    "# certificate or an un-trusted CA\n",
    "configuration.verify_ssl = False\n",
    "configuration.api_key = {\"authorization\": f\"Bearer {token}\"}\n",
    "api_client = k8s.ApiClient(configuration)\n",
    "\n",
    "PVC_MOUNT_PATH = \"/opt/app-root/src\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import random\n",
    "\n",
    "from datasets import load_dataset\n",
    "\n",
    "# Load the Table-GPT dataset\n",
    "print(\"Loading Table-GPT dataset...\")\n",
    "dataset = load_dataset(\"LipengCS/Table-GPT\", \"All\")\n",
    "\n",
    "# Get the training split\n",
    "train_data = dataset[\"train\"]\n",
    "print(f\"Original training set size: {len(train_data)}\")\n",
    "\n",
    "# Create a random subset of 100 samples\n",
    "random.seed(42)  # For reproducibility\n",
    "subset_indices = random.sample(range(len(train_data)), min(100, len(train_data)))\n",
    "subset_data = train_data.select(subset_indices)\n",
    "\n",
    "print(f\"Subset size: {len(subset_data)}\")\n",
    "\n",
    "# Save the subset to a JSONL file on the shared PVC; use an absolute path so it\n",
    "# matches the data_path passed to the training job below\n",
    "output_dir = os.path.join(PVC_MOUNT_PATH, \"table-gpt-data/train\")\n",
    "output_file = f\"{output_dir}/train_All_100.jsonl\"\n",
    "\n",
    "print(f\"Creating directory: {output_dir}\")\n",
    "os.makedirs(output_dir, exist_ok=True)\n",
    "\n",
    "with open(output_file, \"w\") as f:\n",
    "    for example in subset_data:\n",
    "        f.write(json.dumps(example) + \"\\n\")\n",
    "\n",
    "print(f\"Subset saved to {output_file}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "params = {\n",
    "    ###########################################################################\n",
    "    # 🤖 Model + Data Paths                                                   #\n",
    "    ###########################################################################\n",
    "    \"model_path\": \"Qwen/Qwen2.5-1.5B-Instruct\",\n",
    "    \"data_path\": \"/opt/app-root/src/table-gpt-data/train/train_All_100.jsonl\",\n",
    "    \"ckpt_output_dir\": \"/opt/app-root/src/checkpoints-logs-dir\",\n",
    "    \"data_output_path\": \"/opt/app-root/src/osft-json/_data\",\n",
    "    ###########################################################################\n",
    "    # 🏋️‍♀️ Training Hyperparameters                                            #\n",
    "    ###########################################################################\n",
    "    # Important for OSFT\n",
    "    \"unfreeze_rank_ratio\": 0.25,\n",
    "    # Standard parameters\n",
    "    \"effective_batch_size\": 128,\n",
    "    \"learning_rate\": 5.0e-6,\n",
    "    \"num_epochs\": 1,\n",
    "    \"lr_scheduler\": \"cosine\",\n",
    "    \"warmup_steps\": 0,\n",
    "    \"seed\": 42,\n",
    "    ###########################################################################\n",
    "    # 🏎️ Performance Hyperparameters                                          #\n",
    "    ###########################################################################\n",
    "    \"use_liger\": True,\n",
    "    \"max_tokens_per_gpu\": 32000,\n",
    "    \"max_seq_len\": 2048,\n",
    "    ###########################################################################\n",
    "    # 💾 Checkpointing Settings                                               #\n",
    "    ###########################################################################\n",
    "    # Here we only want to save the very last checkpoint\n",
    "    \"save_final_checkpoint\": True,\n",
    "    \"checkpoint_at_epoch\": False,\n",
    "    # \"nproc_per_node\": 2,\n",
    "    # \"nnodes\": 2,\n",
    "    # Note: the distributed training parameters above are commented out because\n",
    "    # they are delegated to the Kubeflow Trainer runtime\n",
    "}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from kubeflow.trainer import TrainerClient\n",
    "from kubeflow.trainer.rhai import TrainingHubAlgorithms, TrainingHubTrainer\n",
    "from kubeflow.common.types import KubernetesBackendConfig\n",
    "\n",
    "backend_cfg = KubernetesBackendConfig(\n",
    "    # Reuse the authenticated Kubernetes client configuration created above\n",
    "    client_configuration=api_client.configuration,\n",
    ")\n",
    "\n",
    "client = TrainerClient(backend_cfg)\n",
    "print(client)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "th_runtime = None\n",
    "for runtime in client.list_runtimes():\n",
    "    if runtime.name == \"training-hub-2node-1gpu\":\n",
    "        th_runtime = runtime\n",
    "        print(\"Found runtime: \" + str(th_runtime))\n",
    "        break\n",
    "\n",
    "if th_runtime is None:\n",
    "    raise RuntimeError(\"Required runtime 'training-hub-2node-1gpu' not found\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from kubeflow.trainer.options.kubernetes import (\n",
    "    PodTemplateOverrides,\n",
    "    PodTemplateOverride,\n",
    "    PodSpecOverride,\n",
    "    ContainerOverride,\n",
    ")\n",
    "\n",
    "cache_root = \"/opt/app-root/src/.cache/huggingface\"\n",
    "triton_cache = \"/opt/app-root/src/.triton\"\n",
    "\n",
    "job_name = client.train(\n",
    "    trainer=TrainingHubTrainer(\n",
    "        algorithm=TrainingHubAlgorithms.OSFT,\n",
    "        func_args=params,\n",
    "        env={\n",
    "            \"HF_HOME\": cache_root,\n",
    "            \"TRITON_CACHE_DIR\": triton_cache,\n",
    "            \"XDG_CACHE_HOME\": \"/opt/app-root/src/.cache\",\n",
    "            \"NCCL_DEBUG\": \"INFO\",\n",
    "        },\n",
    "    ),\n",
    "    options=[\n",
    "        PodTemplateOverrides(\n",
    "            PodTemplateOverride(\n",
    "                target_jobs=[\"node\"],\n",
    "                spec=PodSpecOverride(\n",
    "                    volumes=[\n",
    "                        {\"name\": \"work\", \"persistentVolumeClaim\": {\"claimName\": PVC_NAME}},\n",
    "                    ],\n",
    "                    containers=[\n",
    "                        ContainerOverride(\n",
    "                            name=\"node\",\n",
    "                            volume_mounts=[\n",
    "                                {\"name\": \"work\", \"mountPath\": \"/opt/app-root/src\", \"readOnly\": False},\n",
    "                            ],\n",
    "                        )\n",
    "                    ],\n",
    "                ),\n",
    "            )\n",
    "        )\n",
    "    ],\n",
    "    runtime=th_runtime,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Wait for the job to start running, then for it to complete\n",
    "client.wait_for_job_status(name=job_name, status={\"Running\"}, timeout=300)\n",
    "client.wait_for_job_status(name=job_name, status={\"Complete\"}, timeout=600)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for c in client.get_job(name=job_name).steps:\n",
    "    print(f\"Step: {c.name}, Status: {c.status}, Devices: {c.device} x {c.device_count}\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for logline in client.get_job_logs(job_name, follow=False):\n",
    "    print(logline)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "client.delete_job(job_name)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.12",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
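
Once wait_for_job_status reports Complete, the final checkpoint should land on the shared PVC under the ckpt_output_dir configured in params. A minimal post-run sanity check, assuming safetensors weight files are produced there (the exact subdirectory layout is a training-hub detail this commit does not show):

# Post-run sanity check: confirm model weights were written to the shared PVC.
# The *.safetensors pattern and directory layout are assumptions; adjust to
# whatever training-hub actually writes under ckpt_output_dir.
import pathlib

ckpt_root = pathlib.Path("/opt/app-root/src/checkpoints-logs-dir")
weights = list(ckpt_root.rglob("*.safetensors"))
print(f"Found {len(weights)} weight shard(s) under {ckpt_root}")
assert weights, "training completed but no model weights were found"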
