Skip to content

Commit e7e7c19

Browse files
committed
Rework inference benchmark into a stress test demonstrating FlashArray (FA) model-reload performance
1 parent d76176f commit e7e7c19

File tree

6 files changed

+368
-515
lines changed

6 files changed

+368
-515
lines changed

.env

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -41,18 +41,21 @@ USE_MULTI_GPU=true
4141
LATEST_ONLY=true
4242

4343
# -----------------------------------------------------------------------------
44-
# Pod 6: Inference Benchmark Settings
44+
# Pod 6: FlashArray Model Reload Stress Test
4545
# -----------------------------------------------------------------------------
46-
# Duration of sustained throughput test per model (CPU and GPU)
46+
# Duration of stress test (runs single-threaded then multi-threaded)
4747
BENCHMARK_DURATION=60
4848

49-
# Batch size for inference (records per batch)
50-
BENCHMARK_BATCH_SIZE=10000
51-
52-
# Number of concurrent workers for GPU inference (uses gRPC)
53-
# More workers = better GPU utilization under concurrent load
49+
# Number of concurrent workers for multi-threaded test
50+
# More workers = more I/O pressure on FlashArray
5451
BENCHMARK_WORKERS=8
5552

53+
# Run inference after each model load (simulates real usage)
54+
BENCHMARK_INFERENCE=true
55+
56+
# Batch size for inference (if enabled)
57+
BENCHMARK_BATCH_SIZE=1000
58+
5659
# -----------------------------------------------------------------------------
5760
# S3 Configuration (TODO - FlashBlade S3 Endpoint)
5861
# -----------------------------------------------------------------------------

Makefile

Lines changed: 24 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@ help:
3030
@echo " make build Build all containers"
3131
@echo " make pipeline Run full pipeline (pods 1-3)"
3232
@echo " make inference Start inference server (pod 4)"
33-
@echo " make benchmark Run sustained throughput benchmark (pod 6)"
33+
@echo " make benchmark Run FlashArray model reload stress test (pod 6)"
34+
@echo " make benchmark-io Run pure I/O stress test (no inference)"
3435
@echo " make test Test inference endpoint"
3536
@echo " make stop Stop all containers"
3637
@echo " make clean-data Remove generated data"
@@ -41,19 +42,19 @@ help:
4142
@echo " make pod1 Run data generator"
4243
@echo " make pod2 Run feature engineering (CPU vs GPU comparison)"
4344
@echo " make pod3 Run model training"
44-
@echo " make pod6 Run inference benchmark (requires pod1 data + pod3 model)"
45+
@echo " make pod6 Run FlashArray stress test"
4546
@echo ""
4647
@echo "Configuration (from .env):"
4748
@echo " FB_DATA=$(FB_DATA)"
4849
@echo " FB_PREP=$(FB_PREP)"
4950
@echo " FA_MODEL_REPO=$(FA_MODEL_REPO)"
5051
@echo " DURATION_SECONDS=$(DURATION_SECONDS)s NUM_WORKERS=$(NUM_WORKERS)"
5152
@echo ""
52-
@echo "Benchmark options:"
53+
@echo "Benchmark options (FlashArray stress test):"
5354
@echo " BENCHMARK_DURATION=$(BENCHMARK_DURATION)s"
54-
@echo " BENCHMARK_BATCH_SIZE=$(BENCHMARK_BATCH_SIZE)"
55-
@echo " BENCHMARK_WORKERS=$(BENCHMARK_WORKERS) (concurrent GPU workers)"
56-
@echo " Example: make benchmark BENCHMARK_DURATION=120 BENCHMARK_WORKERS=16"
55+
@echo " BENCHMARK_WORKERS=$(BENCHMARK_WORKERS) (concurrent model loaders)"
56+
@echo " BENCHMARK_INFERENCE=$(BENCHMARK_INFERENCE)"
57+
@echo " Example: make benchmark BENCHMARK_DURATION=120 BENCHMARK_WORKERS=32"
5758

5859
# Verify environment and paths
5960
env-check:
@@ -150,65 +151,48 @@ inference:
150151

151152
# Benchmark settings
152153
BENCHMARK_DURATION ?= 60
153-
BENCHMARK_BATCH_SIZE ?= 10000
154154
BENCHMARK_WORKERS ?= 8
155+
BENCHMARK_INFERENCE ?= true
156+
BENCHMARK_BATCH_SIZE ?= 1000
155157

156-
# Run sustained throughput benchmark (CPU vs GPU)
158+
# Run FlashArray model reload stress test
157159
benchmark:
158160
@echo ""
159161
@echo "=========================================="
160-
@echo "Sustained Throughput Benchmark: CPU vs GPU"
162+
@echo "FlashArray Model Reload Stress Test"
161163
@echo "=========================================="
162-
@echo "Duration: $(BENCHMARK_DURATION)s per model"
163-
@echo "Batch size: $(BENCHMARK_BATCH_SIZE) records"
164-
@echo "GPU workers: $(BENCHMARK_WORKERS) concurrent (gRPC)"
164+
@echo "Duration: $(BENCHMARK_DURATION)s per test"
165+
@echo "Workers: $(BENCHMARK_WORKERS) concurrent"
166+
@echo "Run inference: $(BENCHMARK_INFERENCE)"
165167
@echo ""
166-
@if [ ! -d "$(FB_DATA)" ] || [ -z "$$(ls -A $(FB_DATA)/run_* 2>/dev/null)" ]; then \
167-
echo "ERROR: No data found at $(FB_DATA)/run_*/"; \
168-
echo "Run 'make pod1' or 'make pipeline' first to generate data."; \
169-
exit 1; \
170-
fi
171168
@if [ ! -d "$(FA_MODEL_REPO)/fraud_xgboost" ] && [ ! -d "$(FA_MODEL_REPO)/fraud_xgboost_gpu" ] && [ ! -d "$(FA_MODEL_REPO)/fraud_xgboost_cpu" ]; then \
172169
echo "ERROR: Model not found at $(FA_MODEL_REPO)/"; \
173170
echo "Expected: fraud_xgboost, fraud_xgboost_gpu, or fraud_xgboost_cpu"; \
174171
echo "Run 'make pod3' or 'make pipeline' first to train the model."; \
175172
exit 1; \
176173
fi
177174
@ls -d $(FA_MODEL_REPO)/fraud_xgboost* 2>/dev/null | head -1 | xargs -I{} echo " Found model: {}"
178-
@echo "Starting Triton server if not running..."
179-
@docker compose up -d inference
180-
@echo "Waiting for Triton to be ready..."
181-
@for i in 1 2 3 4 5 6 7 8 9 10; do \
182-
if curl -s http://localhost:8000/v2/health/ready > /dev/null 2>&1; then \
183-
echo " Triton ready!"; \
184-
break; \
185-
fi; \
186-
echo " Waiting... ($$i/10)"; \
187-
sleep 3; \
188-
done
189175
@echo ""
190-
BENCHMARK_DURATION=$(BENCHMARK_DURATION) BENCHMARK_BATCH_SIZE=$(BENCHMARK_BATCH_SIZE) BENCHMARK_WORKERS=$(BENCHMARK_WORKERS) docker compose run --rm benchmark
176+
BENCHMARK_DURATION=$(BENCHMARK_DURATION) BENCHMARK_WORKERS=$(BENCHMARK_WORKERS) BENCHMARK_INFERENCE=$(BENCHMARK_INFERENCE) BENCHMARK_BATCH_SIZE=$(BENCHMARK_BATCH_SIZE) docker compose run --rm benchmark
191177
@echo ""
192-
@echo "Benchmark complete!"
178+
@echo "Stress test complete!"
179+
@echo "Check Grafana for FlashArray I/O metrics"
193180

194-
# Run benchmark without Triton (CPU only)
195-
benchmark-cpu:
181+
# Run FlashArray stress test without inference (pure I/O)
182+
benchmark-io:
196183
@echo ""
197184
@echo "=========================================="
198-
@echo "Sustained Throughput Benchmark: CPU Only"
185+
@echo "FlashArray Pure I/O Stress Test"
199186
@echo "=========================================="
200-
@echo "Duration: $(BENCHMARK_DURATION)s"
201-
@echo "Batch size: $(BENCHMARK_BATCH_SIZE) records"
187+
@echo "Duration: $(BENCHMARK_DURATION)s per test"
188+
@echo "Workers: $(BENCHMARK_WORKERS) concurrent"
189+
@echo "Run inference: false (pure model load I/O)"
202190
@echo ""
203-
@if [ ! -d "$(FB_DATA)" ] || [ -z "$$(ls -A $(FB_DATA)/run_* 2>/dev/null)" ]; then \
204-
echo "ERROR: No data found at $(FB_DATA)/run_*/"; \
205-
exit 1; \
206-
fi
207191
@if [ ! -d "$(FA_MODEL_REPO)/fraud_xgboost" ] && [ ! -d "$(FA_MODEL_REPO)/fraud_xgboost_gpu" ] && [ ! -d "$(FA_MODEL_REPO)/fraud_xgboost_cpu" ]; then \
208192
echo "ERROR: Model not found at $(FA_MODEL_REPO)/"; \
209193
exit 1; \
210194
fi
211-
BENCHMARK_DURATION=$(BENCHMARK_DURATION) BENCHMARK_BATCH_SIZE=$(BENCHMARK_BATCH_SIZE) docker compose run --rm -e TRITON_URL=http://localhost:9999 benchmark
195+
BENCHMARK_DURATION=$(BENCHMARK_DURATION) BENCHMARK_WORKERS=$(BENCHMARK_WORKERS) BENCHMARK_INFERENCE=false docker compose run --rm benchmark
212196

213197
# Test inference
214198
test:

docker-compose.yaml

Lines changed: 12 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -154,31 +154,28 @@ services:
154154
retries: 3
155155

156156
# ============================================================================
157-
# Pod 6: Inference Benchmark
158-
# Sustained throughput test comparing CPU (XGBoost) vs GPU (Triton gRPC)
159-
# Uses concurrent workers to demonstrate GPU parallelism advantage
160-
# Input: ${FB_DATA}/run_*/*.parquet (raw data from Pod 1)
161-
# ${FA_MODEL_REPO}/fraud_xgboost*/ (model from Pod 3)
162-
# Output: Console performance comparison
157+
# Pod 6: FlashArray Model Reload Stress Test
158+
# Demonstrates low-latency model serving from Pure Storage FlashArray
159+
# Continuously loads models to stress test storage I/O
160+
# Input: ${FA_MODEL_REPO}/fraud_xgboost*/ (model from Pod 3)
161+
# ${FB_DATA}/run_*/*.parquet (test data from Pod 1)
162+
# Output: Console performance metrics
163163
# ============================================================================
164164
benchmark:
165165
build: ./pods/benchmark
166166
container_name: fraud-benchmark
167167
volumes:
168-
# Read raw data from Pod 1
169-
- ${FB_DATA}:/data/input:ro
170-
# Read model from Pod 3 output
168+
# Model repository on FlashArray (stress test target)
171169
- ${FA_MODEL_REPO}:/data/models:ro
170+
# Data from FlashBlade (for test inference)
171+
- ${FB_DATA}:/data/input:ro
172172
environment:
173-
- DATA_DIR=/data/input
174173
- MODEL_DIR=/data/models
175-
- TRITON_URL=http://inference:8000
174+
- DATA_DIR=/data/input
176175
- DURATION_SECONDS=${BENCHMARK_DURATION:-60}
177-
- BATCH_SIZE=${BENCHMARK_BATCH_SIZE:-10000}
178176
- NUM_WORKERS=${BENCHMARK_WORKERS:-8}
179-
depends_on:
180-
inference:
181-
condition: service_healthy
177+
- RUN_INFERENCE=${BENCHMARK_INFERENCE:-true}
178+
- INFERENCE_BATCH_SIZE=${BENCHMARK_BATCH_SIZE:-1000}
182179
networks:
183180
- fraud-net
184181

pods/benchmark/Dockerfile

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,26 @@
1-
# Pod 6: Sustained Throughput Benchmark
2-
# Compares CPU (XGBoost) vs GPU (Triton gRPC) inference
1+
# Pod 6: FlashArray Model Reload Stress Test
2+
# Demonstrates low-latency model serving from Pure Storage FlashArray
33

44
FROM python:3.11-slim
55

66
WORKDIR /app
77

8-
# Install dependencies including Triton client for gRPC
8+
# Install dependencies
99
RUN pip install --no-cache-dir \
1010
numpy>=1.26.0 \
1111
pandas>=2.1.0 \
1212
pyarrow>=14.0.0 \
13-
xgboost>=2.0.0 \
14-
requests>=2.31.0 \
15-
tritonclient[grpc]>=2.42.0
13+
xgboost>=2.0.0
1614

1715
COPY benchmark.py .
1816

1917
# Configuration
20-
ENV DATA_DIR=/data/input
2118
ENV MODEL_DIR=/data/models
22-
ENV TRITON_URL=http://inference:8000
19+
ENV DATA_DIR=/data/input
2320
ENV DURATION_SECONDS=60
24-
ENV BATCH_SIZE=10000
2521
ENV NUM_WORKERS=8
22+
ENV RUN_INFERENCE=true
23+
ENV INFERENCE_BATCH_SIZE=1000
2624
ENV PYTHONUNBUFFERED=1
2725

2826
CMD ["python", "benchmark.py"]

0 commit comments

Comments
 (0)