opendatahub-io · abhijeet-dhumal · Sep 24, 2025
diff --git a/examples/ray-kft-v1/1_ray_sdg.ipynb b/examples/ray-kft-v1/1_ray_sdg.ipynb
diff --git a/examples/ray-kft-v1/2_kft_training.ipynb b/examples/ray-kft-v1/2_kft_training.ipynb
@@ -0,0 +1,323 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "d1c9049a-7daa-43aa-9e50-5f9f951a8324",
+   "metadata": {},
+   "source": [
+    "## Phase 2: Distributed Training using Kubeflow Training Operator and SDK\n",
+    "\n",
+    "- **kubeflow-training SDK**: PyTorchJob creation and management\n",
+    "- **TRL + PEFT**: Modern fine-tuning with LoRA adapters\n",
+    "- **Distributed Training**: Multi-node GPU coordination "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "035727e0",
+   "metadata": {},
+   "source": [
+    "### Training Configuration using kubeflow-training SDK"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "92017175-8d63-4dbe-ac8d-f2724b57f9a8",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "%pip install kubernetes yamlmagic"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "c82140d0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%load_ext yamlmagic"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7be28c8f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%yaml training_parameters\n",
+    "\n",
+    "# Model configuration\n",
+    "model_name_or_path: ibm-granite/granite-3.1-2b-instruct\n",
+    "model_revision: main\n",
+    "torch_dtype: bfloat16\n",
+    "attn_implementation: flash_attention_2\n",
+    "use_liger: false\n",
+    "\n",
+    "# PEFT / LoRA configuration\n",
+    "use_peft: true\n",
+    "lora_r: 16\n",
+    "lora_alpha: 16  # Changed from 8 to 16 for better scaling\n",
+    "lora_dropout: 0.05\n",
+    "lora_target_modules: [\"q_proj\", \"v_proj\", \"k_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\"]\n",
+    "lora_modules_to_save: []\n",
+    "\n",
+    "# QLoRA (BitsAndBytes)\n",
+    "load_in_4bit: false\n",
+    "load_in_8bit: false\n",
+    "\n",
+    "# Dataset configuration (synthetic data from Ray preprocessing)\n",
+    "dataset_path: synthetic_gsm8k\n",
+    "dataset_config: main\n",
+    "dataset_train_split: train\n",
+    "dataset_test_split: test\n",
+    "dataset_text_field: text\n",
+    "dataset_kwargs:\n",
+    "  add_special_tokens: false\n",
+    "  append_concat_token: false\n",
+    "\n",
+    "# SFT configuration\n",
+    "max_seq_length: 1024\n",
+    "dataset_batch_size: 1000\n",
+    "packing: false\n",
+    "\n",
+    "# Training hyperparameters\n",
+    "num_train_epochs: 3\n",
+    "per_device_train_batch_size: 8\n",
+    "per_device_eval_batch_size: 8\n",
+    "auto_find_batch_size: false\n",
+    "eval_strategy: epoch\n",
+    "\n",
+    "# Precision and optimization\n",
+    "bf16: true\n",
+    "tf32: false\n",
+    "learning_rate: 1.0e-4  # Reduced from 2.0e-4 for more stable LoRA training\n",
+    "warmup_steps: 100      # Increased from 10 for better stability\n",
+    "lr_scheduler_type: inverse_sqrt\n",
+    "optim: adamw_torch_fused\n",
+    "max_grad_norm: 1.0\n",
+    "seed: 42\n",
+    "\n",
+    "# Gradient settings\n",
+    "gradient_accumulation_steps: 1\n",
+    "gradient_checkpointing: false\n",
+    "gradient_checkpointing_kwargs:\n",
+    "  use_reentrant: false\n",
+    "\n",
+    "# FSDP for distributed training\n",
+    "fsdp: \"full_shard auto_wrap\"\n",
+    "fsdp_config:\n",
+    "  activation_checkpointing: true\n",
+    "  cpu_ram_efficient_loading: false\n",
+    "  sync_module_states: true\n",
+    "  use_orig_params: true\n",
+    "  limit_all_gathers: false\n",
+    "\n",
+    "# Checkpointing and logging\n",
+    "save_strategy: epoch\n",
+    "save_total_limit: 1\n",
+    "resume_from_checkpoint: false\n",
+    "log_level: warning\n",
+    "logging_strategy: steps\n",
+    "logging_steps: 10      # Reduced frequency from 1 to 10\n",
+    "report_to:\n",
+    "- tensorboard\n",
+    "\n",
+    "output_dir: /shared/models/granite-3.1-2b-instruct-synthetic2"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "20521af6",
+   "metadata": {},
+   "source": [
+    "### Configure kubeflow-training Client\n",
+    "\n",
+    "Set up the kubeflow-training SDK client following the sft.ipynb pattern:\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "deb20fde",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "kubeflow-training client configured\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Configure kubeflow-training client (following sft.ipynb pattern)\n",
+    "from kubernetes import client\n",
+    "from kubeflow.training import TrainingClient\n",
+    "from kubeflow.training.models import V1Volume, V1VolumeMount, V1PersistentVolumeClaimVolumeSource\n",
+    "\n",
+    "token=\"<auth_token>\"\n",
+    "api_server=\"<api_server_url>\"\n",
+    "\n",
+    "configuration = client.Configuration()\n",
+    "configuration.host = api_server\n",
+    "configuration.api_key = {\"authorization\": f\"Bearer {token}\"}\n",
+    "# TLS verification is disabled below; keep this only if your cluster API server uses a self-signed certificate or an untrusted CA, otherwise remove the next line\n",
+    "configuration.verify_ssl = False\n",
+    "\n",
+    "api_client = client.ApiClient(configuration)\n",
+    "training_client = TrainingClient(client_configuration=api_client.configuration)\n",
+    "\n",
+    "print(\"kubeflow-training client configured\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "91d0b76b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "PyTorchJob submitted successfully\n"
+     ]
+    }
+   ],
+   "source": [
+    "from scripts.kft_granite_training import training_func\n",
+    "\n",
+    "job = training_client.create_job(\n",
+    "    job_kind=\"PyTorchJob\",\n",
+    "    name=\"test1-training\",\n",
+    "    # Training function imported from scripts.kft_granite_training\n",
+    "    train_func=training_func,\n",
+    "    # Pass YAML parameters as config\n",
+    "    parameters=training_parameters,\n",
+    "    # Distributed training configuration\n",
+    "    num_workers=2,\n",
+    "    num_procs_per_worker=2,\n",
+    "    resources_per_worker={\n",
+    "        \"nvidia.com/gpu\": 2,  # GPUs requested per worker\n",
+    "        \"memory\": \"24Gi\",\n",
+    "        \"cpu\": 4,\n",
+    "    },\n",
+    "    base_image=\"quay.io/modh/training:py311-cuda124-torch251\",\n",
+    "    # Environment variables for training\n",
+    "    env_vars={\n",
+    "        # HuggingFace configuration - use shared storage\n",
+    "        \"HF_HOME\": \"/shared/huggingface_cache\",\n",
+    "        \"HF_DATASETS_CACHE\": \"/shared/huggingface_cache/datasets\",\n",
+    "        \"TOKENIZERS_PARALLELISM\": \"false\",\n",
+    "        # Training configuration\n",
+    "        \"PYTHONUNBUFFERED\": \"1\",\n",
+    "        \"NCCL_DEBUG\": \"INFO\",\n",
+    "    },\n",
+    "    # Package dependencies\n",
+    "    packages_to_install=[\n",
+    "        \"transformers>=4.36.0\",\n",
+    "        \"trl>=0.7.0\",\n",
+    "        \"datasets>=2.14.0\",\n",
+    "        \"peft>=0.6.0\",\n",
+    "        \"accelerate>=0.24.0\",\n",
+    "        \"torch>=2.0.0\",\n",
+    "    ],\n",
+    "    volumes=[\n",
+    "        V1Volume(\n",
+    "            name=\"shared\",\n",
+    "            persistent_volume_claim=V1PersistentVolumeClaimVolumeSource(claim_name=\"shared\")\n",
+    "        ),\n",
+    "    ],\n",
+    "    volume_mounts=[\n",
+    "        V1VolumeMount(name=\"shared\", mount_path=\"/shared\"),\n",
+    "    ],\n",
+    ")\n",
+    "\n",
+    "print(f\"PyTorchJob submitted successfully\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "08beef7d",
+   "metadata": {},
+   "source": [
+    "### Create Training Job using kubeflow-training SDK\n",
+    "\n",
+    "The preceding code cell creates and submits the distributed training job, following the sft.ipynb pattern.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cac9307d",
+   "metadata": {},
+   "source": [
+    "### Monitor Training Job\n",
+    "\n",
+    "Follow the training progress and logs:\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a7f61439",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# Monitor training job logs (following sft.ipynb pattern)\n",
+    "training_client.get_job_logs(\n",
+    "    name=\"test1-training\",\n",
+    "    job_kind=\"PyTorchJob\",\n",
+    "    follow=True,\n",
+    ")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "8571ae47",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "PytorchJob deleted!\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Delete the Training Job\n",
+    "training_client.delete_job(\"test1-training\")\n",
+    "print(\"PytorchJob deleted!\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.12",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}