Skip to content

Commit e7e7c19

Browse files
committed
Rework inference benchmark into a stress test demonstrating FlashArray (FA) model-reload performance
1 parent d76176f commit e7e7c19

File tree

6 files changed

+368
-515
lines changed

6 files changed

+368
-515
lines changed

.env

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -41,18 +41,21 @@ USE_MULTI_GPU=true
4141
LATEST_ONLY=true
4242

4343
# -----------------------------------------------------------------------------
44-
# Pod 6: Inference Benchmark Settings
44+
# Pod 6: FlashArray Model Reload Stress Test
4545
# -----------------------------------------------------------------------------
46-
# Duration of sustained throughput test per model (CPU and GPU)
46+
# Duration of stress test (runs single-threaded then multi-threaded)
4747
BENCHMARK_DURATION=60
4848

49-
# Batch size for inference (records per batch)
50-
BENCHMARK_BATCH_SIZE=10000
51-
52-
# Number of concurrent workers for GPU inference (uses gRPC)
53-
# More workers = better GPU utilization under concurrent load
49+
# Number of concurrent workers for multi-threaded test
50+
# More workers = more I/O pressure on FlashArray
5451
BENCHMARK_WORKERS=8
5552

53+
# Run inference after each model load (simulates real usage)
54+
BENCHMARK_INFERENCE=true
55+
56+
# Batch size for inference (if enabled)
57+
BENCHMARK_BATCH_SIZE=1000
58+
5659
# -----------------------------------------------------------------------------
5760
# S3 Configuration (TODO - FlashBlade S3 Endpoint)
5861
# -----------------------------------------------------------------------------

Makefile

Lines changed: 24 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@ help:
3030
@echo " make build Build all containers"
3131
@echo " make pipeline Run full pipeline (pods 1-3)"
3232
@echo " make inference Start inference server (pod 4)"
33-
@echo " make benchmark Run sustained throughput benchmark (pod 6)"
33+
@echo " make benchmark Run FlashArray model reload stress test (pod 6)"
34+
@echo " make benchmark-io Run pure I/O stress test (no inference)"
3435
@echo " make test Test inference endpoint"
3536
@echo " make stop Stop all containers"
3637
@echo " make clean-data Remove generated data"
@@ -41,19 +42,19 @@ help:
4142
@echo " make pod1 Run data generator"
4243
@echo " make pod2 Run feature engineering (CPU vs GPU comparison)"
4344
@echo " make pod3 Run model training"
44-
@echo " make pod6 Run inference benchmark (requires pod1 data + pod3 model)"
45+
@echo " make pod6 Run FlashArray stress test"
4546
@echo ""
4647
@echo "Configuration (from .env):"
4748
@echo " FB_DATA=$(FB_DATA)"
4849
@echo " FB_PREP=$(FB_PREP)"
4950
@echo " FA_MODEL_REPO=$(FA_MODEL_REPO)"
5051
@echo " DURATION_SECONDS=$(DURATION_SECONDS)s NUM_WORKERS=$(NUM_WORKERS)"
5152
@echo ""
52-
@echo "Benchmark options:"
53+
@echo "Benchmark options (FlashArray stress test):"
5354
@echo " BENCHMARK_DURATION=$(BENCHMARK_DURATION)s"
54-
@echo " BENCHMARK_BATCH_SIZE=$(BENCHMARK_BATCH_SIZE)"
55-
@echo " BENCHMARK_WORKERS=$(BENCHMARK_WORKERS) (concurrent GPU workers)"
56-
@echo " Example: make benchmark BENCHMARK_DURATION=120 BENCHMARK_WORKERS=16"
55+
@echo " BENCHMARK_WORKERS=$(BENCHMARK_WORKERS) (concurrent model loaders)"
56+
@echo " BENCHMARK_INFERENCE=$(BENCHMARK_INFERENCE)"
57+
@echo " Example: make benchmark BENCHMARK_DURATION=120 BENCHMARK_WORKERS=32"
5758

5859
# Verify environment and paths
5960
env-check:
@@ -150,65 +151,48 @@ inference:
150151

151152
# Benchmark settings
152153
BENCHMARK_DURATION ?= 60
153-
BENCHMARK_BATCH_SIZE ?= 10000
154154
BENCHMARK_WORKERS ?= 8
155+
BENCHMARK_INFERENCE ?= true
156+
BENCHMARK_BATCH_SIZE ?= 1000
155157

156-
# Run sustained throughput benchmark (CPU vs GPU)
158+
# Run FlashArray model reload stress test
157159
benchmark:
158160
@echo ""
159161
@echo "=========================================="
160-
@echo "Sustained Throughput Benchmark: CPU vs GPU"
162+
@echo "FlashArray Model Reload Stress Test"
161163
@echo "=========================================="
162-
@echo "Duration: $(BENCHMARK_DURATION)s per model"
163-
@echo "Batch size: $(BENCHMARK_BATCH_SIZE) records"
164-
@echo "GPU workers: $(BENCHMARK_WORKERS) concurrent (gRPC)"
164+
@echo "Duration: $(BENCHMARK_DURATION)s per test"
165+
@echo "Workers: $(BENCHMARK_WORKERS) concurrent"
166+
@echo "Run inference: $(BENCHMARK_INFERENCE)"
165167
@echo ""
166-
@if [ ! -d "$(FB_DATA)" ] || [ -z "$$(ls -A $(FB_DATA)/run_* 2>/dev/null)" ]; then \
167-
echo "ERROR: No data found at $(FB_DATA)/run_*/"; \
168-
echo "Run 'make pod1' or 'make pipeline' first to generate data."; \
169-
exit 1; \
170-
fi
171168
@if [ ! -d "$(FA_MODEL_REPO)/fraud_xgboost" ] && [ ! -d "$(FA_MODEL_REPO)/fraud_xgboost_gpu" ] && [ ! -d "$(FA_MODEL_REPO)/fraud_xgboost_cpu" ]; then \
172169
echo "ERROR: Model not found at $(FA_MODEL_REPO)/"; \
173170
echo "Expected: fraud_xgboost, fraud_xgboost_gpu, or fraud_xgboost_cpu"; \
174171
echo "Run 'make pod3' or 'make pipeline' first to train the model."; \
175172
exit 1; \
176173
fi
177174
@ls -d $(FA_MODEL_REPO)/fraud_xgboost* 2>/dev/null | head -1 | xargs -I{} echo " Found model: {}"
178-
@echo "Starting Triton server if not running..."
179-
@docker compose up -d inference
180-
@echo "Waiting for Triton to be ready..."
181-
@for i in 1 2 3 4 5 6 7 8 9 10; do \
182-
if curl -s http://localhost:8000/v2/health/ready > /dev/null 2>&1; then \
183-
echo " Triton ready!"; \
184-
break; \
185-
fi; \
186-
echo " Waiting... ($$i/10)"; \
187-
sleep 3; \
188-
done
189175
@echo ""
190-
BENCHMARK_DURATION=$(BENCHMARK_DURATION) BENCHMARK_BATCH_SIZE=$(BENCHMARK_BATCH_SIZE) BENCHMARK_WORKERS=$(BENCHMARK_WORKERS) docker compose run --rm benchmark
176+
BENCHMARK_DURATION=$(BENCHMARK_DURATION) BENCHMARK_WORKERS=$(BENCHMARK_WORKERS) BENCHMARK_INFERENCE=$(BENCHMARK_INFERENCE) BENCHMARK_BATCH_SIZE=$(BENCHMARK_BATCH_SIZE) docker compose run --rm benchmark
191177
@echo ""
192-
@echo "Benchmark complete!"
178+
@echo "Stress test complete!"
179+
@echo "Check Grafana for FlashArray I/O metrics"
193180

194-
# Run benchmark without Triton (CPU only)
195-
benchmark-cpu:
181+
# Run FlashArray stress test without inference (pure I/O)
182+
benchmark-io:
196183
@echo ""
197184
@echo "=========================================="
198-
@echo "Sustained Throughput Benchmark: CPU Only"
185+
@echo "FlashArray Pure I/O Stress Test"
199186
@echo "=========================================="
200-
@echo "Duration: $(BENCHMARK_DURATION)s"
201-
@echo "Batch size: $(BENCHMARK_BATCH_SIZE) records"
187+
@echo "Duration: $(BENCHMARK_DURATION)s per test"
188+
@echo "Workers: $(BENCHMARK_WORKERS) concurrent"
189+
@echo "Run inference: false (pure model load I/O)"
202190
@echo ""
203-
@if [ ! -d "$(FB_DATA)" ] || [ -z "$$(ls -A $(FB_DATA)/run_* 2>/dev/null)" ]; then \
204-
echo "ERROR: No data found at $(FB_DATA)/run_*/"; \
205-
exit 1; \
206-
fi
207191
@if [ ! -d "$(FA_MODEL_REPO)/fraud_xgboost" ] && [ ! -d "$(FA_MODEL_REPO)/fraud_xgboost_gpu" ] && [ ! -d "$(FA_MODEL_REPO)/fraud_xgboost_cpu" ]; then \
208192
echo "ERROR: Model not found at $(FA_MODEL_REPO)/"; \
209193
exit 1; \
210194
fi
211-
BENCHMARK_DURATION=$(BENCHMARK_DURATION) BENCHMARK_BATCH_SIZE=$(BENCHMARK_BATCH_SIZE) docker compose run --rm -e TRITON_URL=http://localhost:9999 benchmark
195+
BENCHMARK_DURATION=$(BENCHMARK_DURATION) BENCHMARK_WORKERS=$(BENCHMARK_WORKERS) BENCHMARK_INFERENCE=false docker compose run --rm benchmark
212196

213197
# Test inference
214198
test:

docker-compose.yaml

Lines changed: 12 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -154,31 +154,28 @@ services:
154154
retries: 3
155155

156156
# ============================================================================
157-
# Pod 6: Inference Benchmark
158-
# Sustained throughput test comparing CPU (XGBoost) vs GPU (Triton gRPC)
159-
# Uses concurrent workers to demonstrate GPU parallelism advantage
160-
# Input: ${FB_DATA}/run_*/*.parquet (raw data from Pod 1)
161-
# ${FA_MODEL_REPO}/fraud_xgboost*/ (model from Pod 3)
162-
# Output: Console performance comparison
157+
# Pod 6: FlashArray Model Reload Stress Test
158+
# Demonstrates low-latency model serving from Pure Storage FlashArray
159+
# Continuously loads models to stress test storage I/O
160+
# Input: ${FA_MODEL_REPO}/fraud_xgboost*/ (model from Pod 3)
161+
# ${FB_DATA}/run_*/*.parquet (test data from Pod 1)
162+
# Output: Console performance metrics
163163
# ============================================================================
164164
benchmark:
165165
build: ./pods/benchmark
166166
container_name: fraud-benchmark
167167
volumes:
168-
# Read raw data from Pod 1
169-
- ${FB_DATA}:/data/input:ro
170-
# Read model from Pod 3 output
168+
# Model repository on FlashArray (stress test target)
171169
- ${FA_MODEL_REPO}:/data/models:ro
170+
# Data from FlashBlade (for test inference)
171+
- ${FB_DATA}:/data/input:ro
172172
environment:
173-
- DATA_DIR=/data/input
174173
- MODEL_DIR=/data/models
175-
- TRITON_URL=http://inference:8000
174+
- DATA_DIR=/data/input
176175
- DURATION_SECONDS=${BENCHMARK_DURATION:-60}
177-
- BATCH_SIZE=${BENCHMARK_BATCH_SIZE:-10000}
178176
- NUM_WORKERS=${BENCHMARK_WORKERS:-8}
179-
depends_on:
180-
inference:
181-
condition: service_healthy
177+
- RUN_INFERENCE=${BENCHMARK_INFERENCE:-true}
178+
- INFERENCE_BATCH_SIZE=${BENCHMARK_BATCH_SIZE:-1000}
182179
networks:
183180
- fraud-net
184181

pods/benchmark/Dockerfile

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,26 @@
1-
# Pod 6: Sustained Throughput Benchmark
2-
# Compares CPU (XGBoost) vs GPU (Triton gRPC) inference
1+
# Pod 6: FlashArray Model Reload Stress Test
2+
# Demonstrates low-latency model serving from Pure Storage FlashArray
33

44
FROM python:3.11-slim
55

66
WORKDIR /app
77

8-
# Install dependencies including Triton client for gRPC
8+
# Install dependencies
99
RUN pip install --no-cache-dir \
1010
numpy>=1.26.0 \
1111
pandas>=2.1.0 \
1212
pyarrow>=14.0.0 \
13-
xgboost>=2.0.0 \
14-
requests>=2.31.0 \
15-
tritonclient[grpc]>=2.42.0
13+
xgboost>=2.0.0
1614

1715
COPY benchmark.py .
1816

1917
# Configuration
20-
ENV DATA_DIR=/data/input
2118
ENV MODEL_DIR=/data/models
22-
ENV TRITON_URL=http://inference:8000
19+
ENV DATA_DIR=/data/input
2320
ENV DURATION_SECONDS=60
24-
ENV BATCH_SIZE=10000
2521
ENV NUM_WORKERS=8
22+
ENV RUN_INFERENCE=true
23+
ENV INFERENCE_BATCH_SIZE=1000
2624
ENV PYTHONUNBUFFERED=1
2725

2826
CMD ["python", "benchmark.py"]

0 commit comments

Comments
 (0)