|
43 | 43 | "metadata": {}, |
44 | 44 | "outputs": [], |
45 | 45 | "source": [ |
46 | | - "import json\n", |
47 | | - "import random\n", |
| 46 | + "import os\n", |
| 47 | + "import gzip\n", |
| 48 | + "import shutil\n", |
| 49 | + "import socket\n", |
| 50 | + "import time\n", |
| 51 | + "\n", |
| 52 | + "import boto3\n", |
| 53 | + "from botocore.config import Config as BotoConfig\n", |
| 54 | + "from botocore.exceptions import ClientError\n", |
| 55 | + "\n", |
| 56 | + "# --- Global networking safety net: cap all socket operations ---\n", |
| 57 | + "socket.setdefaulttimeout(10) # seconds\n", |
| 58 | + "\n", |
| 59 | + "# Notebook's PVC mount path (per Notebook CR). Training pods will mount the same PVC at /opt/app-root/src\n", |
| 60 | + "PVC_NOTEBOOK_PATH = \"/opt/app-root/src\"\n", |
| 61 | + "DATASET_ROOT_NOTEBOOK = PVC_NOTEBOOK_PATH\n", |
| 62 | + "TABLE_GPT_DIR = os.path.join(DATASET_ROOT_NOTEBOOK, \"table-gpt-data\", \"train\")\n", |
| 63 | + "MODEL_DIR = os.path.join(DATASET_ROOT_NOTEBOOK, \"Qwen\", \"Qwen2.5-1.5B-Instruct\")\n", |
| 64 | + "os.makedirs(TABLE_GPT_DIR, exist_ok=True)\n", |
| 65 | + "os.makedirs(MODEL_DIR, exist_ok=True)\n", |
| 66 | + "\n", |
| 67 | + "# Env config for S3/MinIO\n", |
| 68 | + "s3_endpoint = os.getenv(\"AWS_DEFAULT_ENDPOINT\", \"\")\n", |
| 69 | + "s3_access_key = os.getenv(\"AWS_ACCESS_KEY_ID\", \"\")\n", |
| 70 | + "s3_secret_key = os.getenv(\"AWS_SECRET_ACCESS_KEY\", \"\")\n", |
| 71 | + "s3_bucket = os.getenv(\"AWS_STORAGE_BUCKET\", \"\")\n", |
| 72 | + "s3_prefix = os.getenv(\"AWS_STORAGE_BUCKET_DATA_DIR\", \"\") # e.g. \"osft-data\"\n", |
| 73 | + "\n", |
| 74 | + "def stream_download(s3, bucket, key, dst):\n", |
| 75 | + " \"\"\"\n", |
| 76 | + " Download an object from S3/MinIO using get_object and streaming reads.\n", |
| 77 | + " Returns True on success, False on any error.\n", |
| 78 | + " \"\"\"\n", |
| 79 | + " print(f\"[notebook] STREAM download s3://{bucket}/{key} -> {dst}\")\n", |
| 80 | + " t0 = time.time()\n", |
| 81 | + "\n", |
| 82 | + " try:\n", |
| 83 | + " resp = s3.get_object(Bucket=bucket, Key=key)\n", |
| 84 | + " except ClientError as e:\n", |
| 85 | + " err = e.response.get(\"Error\", {})\n", |
| 86 | + " print(f\"[notebook] CLIENT ERROR (get_object) for {key}: {err}\")\n", |
| 87 | + " return False\n", |
| 88 | + " except Exception as e:\n", |
| 89 | + " print(f\"[notebook] OTHER ERROR (get_object) for {key}: {e}\")\n", |
| 90 | + " return False\n", |
| 91 | + "\n", |
| 92 | + " body = resp[\"Body\"]\n", |
| 93 | + " try:\n", |
| 94 | + " with open(dst, \"wb\") as f:\n", |
| 95 | + " while True:\n", |
| 96 | + " try:\n", |
| 97 | + " chunk = body.read(1024 * 1024) # 1MB per chunk\n", |
| 98 | + " except socket.timeout as e:\n", |
| 99 | + " print(f\"[notebook] socket.timeout while reading {key}: {e}\")\n", |
| 100 | + " return False\n", |
| 101 | + " if not chunk:\n", |
| 102 | + " break\n", |
| 103 | + " f.write(chunk)\n", |
| 104 | + " except Exception as e:\n", |
| 105 | + " print(f\"[notebook] ERROR writing to {dst} for {key}: {e}\")\n", |
| 106 | + " return False\n", |
| 107 | + "\n", |
| 108 | + " t1 = time.time()\n", |
| 109 | + " print(f\"[notebook] DONE stream {key} in {t1 - t0:.2f}s\")\n", |
| 110 | + " return True\n", |
| 111 | + "\n", |
| 112 | + "\n", |
| 113 | + "if s3_endpoint and s3_bucket:\n", |
| 114 | + " try:\n", |
| 115 | + " # Normalize endpoint URL\n", |
| 116 | + " endpoint_url = (\n", |
| 117 | + " s3_endpoint\n", |
| 118 | + " if s3_endpoint.startswith(\"http\")\n", |
| 119 | + " else f\"https://{s3_endpoint}\"\n", |
| 120 | + " )\n", |
| 121 | + " prefix = (s3_prefix or \"\").strip(\"/\")\n", |
| 122 | + "\n", |
| 123 | + " print(\n", |
| 124 | + " f\"S3 configured (boto3, notebook): \"\n", |
| 125 | + " f\"endpoint={endpoint_url}, bucket={s3_bucket}, prefix={prefix or '<root>'}\"\n", |
| 126 | + " )\n", |
| 127 | + "\n", |
| 128 | + " # Boto config: single attempt, reasonable connect/read timeouts\n", |
| 129 | + " boto_cfg = BotoConfig(\n", |
| 130 | + " signature_version=\"s3v4\",\n", |
| 131 | + " s3={\"addressing_style\": \"path\"},\n", |
| 132 | + " retries={\"max_attempts\": 1, \"mode\": \"standard\"},\n", |
| 133 | + " connect_timeout=5,\n", |
| 134 | + " read_timeout=10,\n", |
| 135 | + " )\n", |
48 | 136 | "\n", |
49 | | - "from datasets import load_dataset\n", |
| 137 | + " # Create S3/MinIO client\n", |
| 138 | + " s3 = boto3.client(\n", |
| 139 | + " \"s3\",\n", |
| 140 | + " endpoint_url=endpoint_url,\n", |
| 141 | + " aws_access_key_id=s3_access_key,\n", |
| 142 | + " aws_secret_access_key=s3_secret_key,\n", |
| 143 | + " config=boto_cfg,\n", |
| 144 | + " verify=False,\n", |
| 145 | + " )\n", |
| 146 | + "\n", |
| 147 | + " # List and download all objects under the prefix\n", |
| 148 | + " paginator = s3.get_paginator(\"list_objects_v2\")\n", |
| 149 | + " pulled_any = False\n", |
| 150 | + " file_count = 0\n", |
| 151 | + "\n", |
| 152 | + " print(f\"[notebook] Starting S3 download from prefix: {prefix}\")\n", |
| 153 | + " for page in paginator.paginate(Bucket=s3_bucket, Prefix=prefix or \"\"):\n", |
| 154 | + " contents = page.get(\"Contents\", [])\n", |
| 155 | + " if not contents:\n", |
| 156 | + " print(f\"[notebook] No contents found in this page\")\n", |
| 157 | + " continue\n", |
| 158 | + " \n", |
| 159 | + " print(f\"[notebook] Found {len(contents)} objects in this page\")\n", |
| 160 | + "\n", |
| 161 | + " for obj in contents:\n", |
| 162 | + " key = obj[\"Key\"]\n", |
| 163 | + " file_count += 1\n", |
| 164 | + "\n", |
| 165 | + " # Skip \"directory markers\"\n", |
| 166 | + " if key.endswith(\"/\"):\n", |
| 167 | + " print(f\"[notebook] Skipping directory marker: {key}\")\n", |
| 168 | + " continue\n", |
| 169 | + "\n", |
| 170 | + " # Determine relative path under prefix for local storage\n", |
| 171 | + " rel = key[len(prefix):].lstrip(\"/\") if prefix else key\n", |
| 172 | + " print(f\"[notebook] Processing key={key}, rel={rel}\")\n", |
| 173 | + " \n", |
| 174 | + " # Route to appropriate directory based on content type\n", |
| 175 | + " if \"table-gpt\" in rel.lower() or rel.endswith(\".jsonl\"):\n", |
| 176 | + " dst = os.path.join(TABLE_GPT_DIR, os.path.basename(rel))\n", |
| 177 | + " print(f\"[notebook] Routing to dataset dir: {dst}\")\n", |
| 178 | + " elif \"qwen\" in rel.lower() or any(rel.endswith(ext) for ext in [\".bin\", \".json\", \".model\", \".safetensors\", \".txt\"]):\n", |
| 179 | + " # Preserve directory structure for model files\n", |
| 180 | + " dst = os.path.join(MODEL_DIR, rel.split(\"Qwen2.5-1.5B-Instruct/\")[-1] if \"Qwen2.5-1.5B-Instruct\" in rel else os.path.basename(rel))\n", |
| 181 | + " print(f\"[notebook] Routing to model dir: {dst}\")\n", |
| 182 | + " else:\n", |
| 183 | + " # Default: use the relative path as-is\n", |
| 184 | + " dst = os.path.join(DATASET_ROOT_NOTEBOOK, rel)\n", |
| 185 | + " print(f\"[notebook] Routing to default dir: {dst}\")\n", |
| 186 | + " \n", |
| 187 | + " os.makedirs(os.path.dirname(dst), exist_ok=True)\n", |
| 188 | + "\n", |
| 189 | + " # Download only if missing\n", |
| 190 | + " if not os.path.exists(dst):\n", |
| 191 | + " ok = stream_download(s3, s3_bucket, key, dst)\n", |
| 192 | + " if not ok:\n", |
| 193 | + " print(f\"[notebook] Download failed for {key}\")\n", |
| 194 | + " continue\n", |
| 195 | + " pulled_any = True\n", |
| 196 | + " else:\n", |
| 197 | + " print(f\"[notebook] Skipping existing file {dst}\")\n", |
| 198 | + " pulled_any = True\n", |
| 199 | + "\n", |
| 200 | + " # If the file is .gz, decompress and remove the .gz\n", |
| 201 | + " if dst.endswith(\".gz\") and os.path.exists(dst):\n", |
| 202 | + " out_path = os.path.splitext(dst)[0]\n", |
| 203 | + " if not os.path.exists(out_path):\n", |
| 204 | + " print(f\"[notebook] Decompressing {dst} -> {out_path}\")\n", |
| 205 | + " try:\n", |
| 206 | + " with gzip.open(dst, \"rb\") as f_in, open(out_path, \"wb\") as f_out:\n", |
| 207 | + " shutil.copyfileobj(f_in, f_out)\n", |
| 208 | + " except Exception as e:\n", |
| 209 | + " print(f\"[notebook] Failed to decompress {dst}: {e}\")\n", |
| 210 | + " else:\n", |
| 211 | + " try:\n", |
| 212 | + " os.remove(dst)\n", |
| 213 | + " except Exception:\n", |
| 214 | + " pass\n", |
| 215 | + "\n", |
| 216 | + " print(f\"[notebook] S3 download complete. Processed {file_count} files, pulled_any={pulled_any}\")\n", |
50 | 217 | "\n", |
51 | | - "# Load the Table-GPT dataset\n", |
52 | | - "print(\"Loading Table-GPT dataset...\")\n", |
53 | | - "dataset = load_dataset(\"LipengCS/Table-GPT\", \"All\")\n", |
| 218 | + " except Exception as e:\n", |
| 219 | + " print(f\"[notebook] S3 fetch failed: {e}\")\n", |
| 220 | + " import traceback\n", |
| 221 | + " traceback.print_exc()\n", |
| 222 | + "else:\n", |
| 223 | + " print(\"[notebook] S3 not configured: missing endpoint or bucket env vars\")\n", |
| 224 | + " # Fallback to HuggingFace if S3 is not configured\n", |
| 225 | + " print(\"[notebook] Falling back to HuggingFace dataset download...\")\n", |
| 226 | + " import json\n", |
| 227 | + " import random\n", |
| 228 | + " from datasets import load_dataset\n", |
54 | 229 | "\n", |
55 | | - "# Get the training split and create a random subset of 100 samples\n", |
56 | | - "train_data = dataset[\"train\"]\n", |
57 | | - "print(f\"Original training set size: {len(train_data)}\")\n", |
| 230 | + " # Load the Table-GPT dataset\n", |
| 231 | + " print(\"Loading Table-GPT dataset...\")\n", |
| 232 | + " dataset = load_dataset(\"LipengCS/Table-GPT\", \"All\")\n", |
58 | 233 | "\n", |
59 | | - "# Create a random subset of 100 samples\n", |
60 | | - "random.seed(42) # For reproducibility\n", |
61 | | - "subset_indices = random.sample(range(len(train_data)), min(100, len(train_data)))\n", |
62 | | - "subset_data = train_data.select(subset_indices)\n", |
| 234 | + " # Get the training split and create a random subset of 100 samples\n", |
| 235 | + " train_data = dataset[\"train\"]\n", |
| 236 | + " print(f\"Original training set size: {len(train_data)}\")\n", |
63 | 237 | "\n", |
64 | | - "print(f\"Subset size: {len(subset_data)}\")\n", |
| 238 | + " # Create a random subset of 100 samples\n", |
| 239 | + " random.seed(42) # For reproducibility\n", |
| 240 | + " subset_indices = random.sample(range(len(train_data)), min(100, len(train_data)))\n", |
| 241 | + " subset_data = train_data.select(subset_indices)\n", |
65 | 242 | "\n", |
66 | | - "# Save the subset to a JSONL file\n", |
67 | | - "# Save the subset to a JSONL file - USE ABSOLUTE PATH\n", |
68 | | - "output_dir = \"table-gpt-data/train\"\n", |
69 | | - "output_file = f\"{output_dir}/train_All_100.jsonl\"\n", |
| 243 | + " print(f\"Subset size: {len(subset_data)}\")\n", |
70 | 244 | "\n", |
71 | | - "print(f\"Creating directory: {output_dir}\")\n", |
72 | | - "os.makedirs(output_dir, exist_ok=True)\n", |
| 245 | + " # Save the subset to a JSONL file\n", |
| 246 | + " output_file = os.path.join(TABLE_GPT_DIR, \"train_All_100.jsonl\")\n", |
| 247 | + " with open(output_file, \"w\") as f:\n", |
| 248 | + " for example in subset_data:\n", |
| 249 | + " f.write(json.dumps(example) + \"\\n\")\n", |
73 | 250 | "\n", |
74 | | - "with open(output_file, \"w\") as f:\n", |
75 | | - " for example in subset_data:\n", |
76 | | - " f.write(json.dumps(example) + \"\\n\")\n", |
| 251 | + " print(f\"Subset saved to {output_file}\")\n", |
77 | 252 | "\n", |
78 | | - "print(f\"Subset saved to {output_file}\")" |
| 253 | + "# Verify dataset file exists\n", |
| 254 | + "dataset_file = os.path.join(TABLE_GPT_DIR, \"train_All_100.jsonl\")\n", |
| 255 | + "if os.path.exists(dataset_file):\n", |
| 256 | + " print(f\"[notebook] Dataset ready: {dataset_file}\")\n", |
| 257 | + "else:\n", |
| 258 | + " raise RuntimeError(f\"Dataset file not found: {dataset_file}\")\n", |
| 259 | + "\n", |
| 260 | + "# Verify model directory has files\n", |
| 261 | + "if os.path.exists(MODEL_DIR) and os.listdir(MODEL_DIR):\n", |
| 262 | + " print(f\"[notebook] Model files ready in: {MODEL_DIR}\")\n", |
| 263 | + " print(f\"[notebook] Model files: {os.listdir(MODEL_DIR)[:5]}...\") # Show first 5 files\n", |
| 264 | + "else:\n", |
| 265 | + " print(f\"[notebook] Warning: Model directory is empty or missing: {MODEL_DIR}\")\n", |
| 266 | + " print(\"[notebook] Training will attempt to download from HuggingFace during execution\")" |
79 | 267 | ] |
80 | 268 | }, |
81 | 269 | { |
82 | 270 | "cell_type": "code", |
83 | 271 | "execution_count": null, |
84 | 272 | "metadata": {}, |
85 | | - "outputs": [ |
86 | | - { |
87 | | - "name": "stdout", |
88 | | - "output_type": "stream", |
89 | | - "text": [ |
90 | | - "⚙️ Training Hyperparameters\n", |
91 | | - "==================================================\n" |
92 | | - ] |
93 | | - } |
94 | | - ], |
| 273 | + "outputs": [], |
95 | 274 | "source": [ |
| 275 | + "# Determine model path based on whether S3 download succeeded\n", |
| 276 | + "import os\n", |
| 277 | + "LOCAL_MODEL_PATH = \"/opt/app-root/src/Qwen/Qwen2.5-1.5B-Instruct\"\n", |
| 278 | + "HUGGINGFACE_MODEL_ID = \"Qwen/Qwen2.5-1.5B-Instruct\"\n", |
| 279 | + "\n", |
| 280 | + "# Check if model was downloaded from S3\n", |
| 281 | + "model_downloaded = os.path.exists(LOCAL_MODEL_PATH) and len(os.listdir(LOCAL_MODEL_PATH)) > 0\n", |
| 282 | + "\n", |
| 283 | + "if model_downloaded:\n", |
| 284 | + " model_path_to_use = LOCAL_MODEL_PATH\n", |
| 285 | + " print(f\"✓ Using local model from S3: {model_path_to_use}\")\n", |
| 286 | + "else:\n", |
| 287 | + " model_path_to_use = HUGGINGFACE_MODEL_ID \n", |
| 288 | + " print(f\"✓ Using HuggingFace model ID: {model_path_to_use}\")\n", |
| 289 | + "\n", |
96 | 290 | "params = {\n", |
97 | 291 | " ###########################################################################\n", |
98 | 292 | " # 🤖 Model + Data Paths #\n", |
99 | 293 | " ###########################################################################\n", |
100 | | - " \"model_path\": \"Qwen/Qwen2.5-1.5B-Instruct\",\n", |
| 294 | + " \"model_path\": model_path_to_use,\n", |
101 | 295 | " \"data_path\": \"/opt/app-root/src/table-gpt-data/train/train_All_100.jsonl\",\n", |
102 | 296 | " \"ckpt_output_dir\": \"/opt/app-root/src/checkpoints-logs-dir\",\n", |
103 | 297 | " \"data_output_path\": \"/opt/app-root/src/osft-json/_data\",\n", |
|
227 | 421 | "metadata": {}, |
228 | 422 | "outputs": [], |
229 | 423 | "source": [ |
230 | | - "# Wait for the running status, then completion.\n", |
| 424 | + "# Wait for the running status, then wait for completion or failure\n", |
| 425 | + "# Using reasonable timeout for OSFT training\n", |
231 | 426 | "client.wait_for_job_status(name=job_name, status={\"Running\"}, timeout=300)\n", |
232 | | - "client.wait_for_job_status(name=job_name, status={\"Complete\"}, timeout=600)" |
| 427 | + "client.wait_for_job_status(name=job_name, status={\"Complete\", \"Failed\"}, timeout=1800) # 30 minutes for training\n", |
| 428 | + "\n", |
| 429 | + "# Check if the job succeeded\n", |
| 430 | + "job = client.get_job(name=job_name)\n", |
| 431 | + "\n", |
| 432 | + "# Check for success: status should be \"Complete\" and not \"Failed\"\n", |
| 433 | + "if job.status == \"Failed\":\n", |
| 434 | + " print(f\"ERROR: Training job failed\")\n", |
| 435 | + " raise RuntimeError(f\"Training job failed with status: {job.status}\")\n", |
| 436 | + "elif job.status == \"Complete\":\n", |
| 437 | + " print(\"✓ Training job completed successfully\")\n", |
| 438 | + "else:\n", |
| 439 | + " # Unexpected status\n", |
| 440 | + " print(f\"ERROR: Unexpected job status: {job.status}\")\n", |
| 441 | + " raise RuntimeError(f\"Training job ended with unexpected status: {job.status}\")" |
233 | 442 | ] |
234 | 443 | }, |
235 | 444 | { |
|