Skip to content

Commit 0901896

Browse files
authored
Slo prediction experimental (#1677)
* add latency predictor build readme * update test dual server * allow batch prediction * allow batch prediction, update slo headers to all small
1 parent f3bd449 commit 0901896

18 files changed

+2605
-2632
lines changed

latencypredictor-v1/Dockerfile-prediction

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,12 @@ WORKDIR /app
66

77
# Copy the requirements file and install dependencies
88
# (It's good practice to manage dependencies in a requirements.txt file)
9+
10+
11+
RUN apt-get update && apt-get install -y \
12+
libgomp1 \
13+
&& rm -rf /var/lib/apt/lists/*
14+
915
COPY requirements.txt .
1016
RUN pip install --no-cache-dir -r requirements.txt
1117

latencypredictor-v1/Dockerfile-test

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# Dockerfile-test
#
# Image that runs test_dual_server_client.py with pytest.
FROM python:3.9-slim

# Install system dependencies.
# --no-install-recommends keeps the slim image small by skipping optional
# packages; the apt lists are removed in the same layer so they never end up
# in the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    curl \
    wget \
    jq \
    && rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /app

# Copy requirements and install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Install additional testing dependencies
RUN pip install --no-cache-dir \
    pytest \
    pytest-asyncio \
    requests \
    httpx \
    aiohttp

# Copy test files
COPY test_dual_server_client.py .

# Create test results directory
RUN mkdir -p /test-results

# Set environment variables:
# PYTHONPATH lets modules under /app be imported; PYTHONUNBUFFERED makes
# Python output appear in the container logs immediately.
ENV PYTHONPATH=/app
ENV PYTHONUNBUFFERED=1

# Default command runs the specific test
CMD ["pytest", "-v", "-s", "test_dual_server_client.py"]

latencypredictor-v1/Dockerfile-training

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,13 @@ WORKDIR /app
66

77
# Copy the requirements file and install dependencies
88
# (It's good practice to manage dependencies in a requirements.txt file)
9+
10+
11+
RUN apt-get update && apt-get install -y \
12+
libgomp1 \
13+
&& rm -rf /var/lib/apt/lists/*
14+
15+
916
COPY requirements.txt .
1017
RUN pip install --no-cache-dir -r requirements.txt
1118

latencypredictor-v1/build-deploy.sh

Lines changed: 191 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
11
#!/bin/bash
2-
# Build and deploy script for both servers
2+
# Build and deploy script for training, prediction, and test servers
33

44
set -e
55

66
# Configuration
77
PROJECT_ID="kaushikmitra-gke-dev"
88
REGION="asia-southeast1-c"
99
REPOSITORY="kaushikmitra-docker-repo"
10-
TRAINING_IMAGE="latencypredictor-v1-training-server"
11-
PREDICTION_IMAGE="latencypredictor-v1-prediction-server"
10+
TRAINING_IMAGE="latencypredictor-v3-training-server"
11+
PREDICTION_IMAGE="latencypredictor-v3-prediction-server"
12+
TEST_IMAGE="latencypredictor-v3-test"
1213
TAG="latest"
1314

1415
# Colors for output
@@ -41,7 +42,18 @@ check_files() {
4142
fi
4243
done
4344

44-
echo_status "All required files found."
45+
# Check for test-specific files
46+
local test_files=("Dockerfile-test")
47+
for file in "${test_files[@]}"; do
48+
if [[ ! -f "$file" ]]; then
49+
echo_warning "Test file $file not found - test image will not be built"
50+
TEST_BUILD_ENABLED=false
51+
return
52+
fi
53+
done
54+
55+
TEST_BUILD_ENABLED=true
56+
echo_status "All required files found (including test files)."
4557
}
4658

4759
# Build Docker images
@@ -50,7 +62,7 @@ build_images() {
5062

5163
# Build training server image
5264
echo_status "Building training server image..."
53-
docker build -f Dockerfile-training -t ${TRAINING_IMAGE}:${TAG} .
65+
docker build -f Dockerfile-training -t ${TRAINING_IMAGE}:${TAG} .
5466

5567
# Tag for training server
5668
docker tag ${TRAINING_IMAGE}:${TAG} \
@@ -64,7 +76,19 @@ build_images() {
6476
docker tag ${PREDICTION_IMAGE}:${TAG} \
6577
us-docker.pkg.dev/${PROJECT_ID}/${REPOSITORY}/${PREDICTION_IMAGE}:${TAG}
6678

67-
echo_status "Images built successfully."
79+
# Build test image if enabled
80+
if [[ "$TEST_BUILD_ENABLED" == "true" ]]; then
81+
echo_status "Building test image..."
82+
docker build -f Dockerfile-test -t ${TEST_IMAGE}:${TAG} .
83+
84+
# Tag for test image
85+
docker tag ${TEST_IMAGE}:${TAG} \
86+
us-docker.pkg.dev/${PROJECT_ID}/${REPOSITORY}/${TEST_IMAGE}:${TAG}
87+
88+
echo_status "All images (including test) built successfully."
89+
else
90+
echo_status "Images built successfully (test image skipped)."
91+
fi
6892
}
6993

7094
# Push images to Artifact Registry
@@ -82,7 +106,14 @@ push_images() {
82106
echo_status "Pushing prediction server image..."
83107
docker push us-docker.pkg.dev/${PROJECT_ID}/${REPOSITORY}/${PREDICTION_IMAGE}:${TAG}
84108

85-
echo_status "Images pushed successfully."
109+
# Push test image if enabled
110+
if [[ "$TEST_BUILD_ENABLED" == "true" ]]; then
111+
echo_status "Pushing test image..."
112+
docker push us-docker.pkg.dev/${PROJECT_ID}/${REPOSITORY}/${TEST_IMAGE}:${TAG}
113+
echo_status "All images (including test) pushed successfully."
114+
else
115+
echo_status "Images pushed successfully (test image skipped)."
116+
fi
86117
}
87118

88119
# Deploy to GKE
@@ -102,6 +133,112 @@ deploy_to_gke() {
102133
echo_status "Deployment completed successfully."
103134
}
104135

136+
#######################################
# Deploy the test Job described by test-job.yaml.
# Globals:   TEST_BUILD_ENABLED (read)
# Arguments: none
# Outputs:   progress messages via echo_status / echo_warning; generates
#            test-job.yaml in the current directory when it is missing
# Returns:   0 when the deployment is skipped because no test image exists
#######################################
deploy_test() {
    echo_status "Deploying test job..."

    if [[ "$TEST_BUILD_ENABLED" != "true" ]]; then
        echo_warning "Test image not available. Skipping test deployment."
        return
    fi

    # Check if test manifest exists
    if [[ ! -f "test-job.yaml" ]]; then
        echo_warning "test-job.yaml not found. Creating a basic test job..."
        create_test_manifest
    fi

    # Delete the previous run through the manifest rather than by bare name:
    # the delete then targets exactly the object (and namespace) that the
    # apply below creates, even when the kube-context's current namespace
    # differs from the one pinned in the manifest. A leftover Job would make
    # the apply fail, because a Job's pod template is immutable.
    kubectl delete -f test-job.yaml --ignore-not-found=true

    # Apply test job
    kubectl apply -f test-job.yaml

    echo_status "Test job deployed. Monitor with: kubectl logs -f job/latency-predictor-test"
}
159+
160+
#######################################
# Write a basic Kubernetes Job manifest for the test image to test-job.yaml
# in the current directory (an existing file is overwritten).
# Globals:   PROJECT_ID, REPOSITORY, TEST_IMAGE, TAG (read) - image reference
#            TEST_NODE_POOL (read, optional) - GKE node pool for the test pod;
#            defaults to "pool-2"
# Arguments: none
# Outputs:   test-job.yaml; a status message via echo_status
# Returns:   1 (without writing the file) if an image coordinate is empty
#######################################
create_test_manifest() {
    # Refuse to emit a manifest whose image reference would be malformed
    # (e.g. "us-docker.pkg.dev///:") because a coordinate is unset.
    local required
    for required in PROJECT_ID REPOSITORY TEST_IMAGE TAG; do
        if [[ -z "${!required:-}" ]]; then
            echo_error "${required} is not set; cannot create test-job.yaml"
            return 1
        fi
    done

    local node_pool="${TEST_NODE_POOL:-pool-2}"

    # Unquoted delimiter on purpose: the image coordinates and node pool are
    # expanded while the file is written.
    cat > test-job.yaml << EOF
apiVersion: batch/v1
kind: Job
metadata:
  name: latency-predictor-test
  namespace: default
  labels:
    app: latency-predictor-test
    component: test
spec:
  template:
    metadata:
      labels:
        app: latency-predictor-test
        component: test
    spec:
      nodeSelector:
        cloud.google.com/gke-nodepool: "${node_pool}"
      restartPolicy: Never
      containers:
      - name: test-runner
        image: us-docker.pkg.dev/${PROJECT_ID}/${REPOSITORY}/${TEST_IMAGE}:${TAG}
        imagePullPolicy: Always
        command: ["pytest"]
        args: ["-v", "-s", "test_dual_server_client.py"]
        resources:
          requests:
            cpu: "500m"
            memory: "1Gi"
          limits:
            cpu: "1000m"
            memory: "2Gi"
        env:
        - name: TRAINING_SERVER_URL
          value: "http://training-service:8000"
        - name: PREDICTION_SERVER_URL
          value: "http://prediction-service:80"
        - name: TEST_TIMEOUT
          value: "300"
        volumeMounts:
        - name: test-results
          mountPath: /test-results
      volumes:
      - name: test-results
        emptyDir: {}
  backoffLimit: 3
EOF
    echo_status "Created basic test-job.yaml manifest."
}
211+
212+
#######################################
# Run the comprehensive test Job and report its result. When no test image
# was built, fall back to the basic connectivity tests (test_deployment).
# Globals:   TEST_BUILD_ENABLED (read)
# Arguments: none
# Outputs:   progress messages via echo_status / echo_warning / echo_error;
#            the Job's description and logs from kubectl
# Returns:   1 if the Job fails or does not finish within 600 seconds
#######################################
run_tests() {
    echo_status "Running tests..."

    if [[ "$TEST_BUILD_ENABLED" != "true" ]]; then
        echo_warning "Test image not available. Running basic connectivity tests instead..."
        test_deployment
        return
    fi

    # Deploy and run test job
    deploy_test

    # Wait for job completion and show logs
    echo_status "Waiting for test job to complete..."

    # A failed Job never gains the Complete condition, so a single
    # "kubectl wait --for=condition=complete" would sit out the whole timeout
    # on every failing run. Probe both terminal conditions instead
    # (--timeout=0s means "check once, do not wait") until one is met or the
    # overall deadline passes.
    local -r job="job/latency-predictor-test"
    local -r timeout_secs=600 poll_secs=5
    local deadline=$(( SECONDS + timeout_secs ))
    local outcome=""
    while [[ -z "$outcome" ]]; do
        if kubectl wait --for=condition=complete "$job" --timeout=0s > /dev/null 2>&1; then
            outcome="complete"
        elif kubectl wait --for=condition=failed "$job" --timeout=0s > /dev/null 2>&1; then
            outcome="failed"
        elif (( SECONDS >= deadline )); then
            outcome="timeout"
        else
            sleep "$poll_secs"
        fi
    done

    if [[ "$outcome" != "complete" ]]; then
        echo_error "Test job did not complete successfully"
        kubectl describe job latency-predictor-test
        kubectl logs job/latency-predictor-test
        # The Job is left in place so it can be inspected.
        return 1
    fi

    echo_status "Test job completed. Showing logs:"
    kubectl logs job/latency-predictor-test

    # Clean up test job
    echo_status "Cleaning up test job..."
    kubectl delete job latency-predictor-test
}
241+
105242
# Get service information
106243
get_service_info() {
107244
echo_status "Getting service information..."
@@ -131,7 +268,7 @@ get_service_info() {
131268
kubectl get services
132269
}
133270

134-
# Test the deployment
271+
# Test the deployment (basic connectivity tests)
135272
test_deployment() {
136273
echo_status "Testing deployment..."
137274

@@ -165,6 +302,18 @@ test_deployment() {
165302
fi
166303
}
167304

305+
#######################################
# List the training, prediction and test images, both in the local Docker
# daemon and in Artifact Registry.
# Globals:   TRAINING_IMAGE, PREDICTION_IMAGE, TEST_IMAGE, PROJECT_ID,
#            REPOSITORY (all read)
# Arguments: none
# Outputs:   matching image listings on stdout, or a "No ... images found"
#            line for a source that yields nothing
#######################################
list_images() {
    echo_status "Listing built images..."

    # The same alternation serves as the grep pattern and the gcloud filter.
    local -r image_names="${TRAINING_IMAGE}|${PREDICTION_IMAGE}|${TEST_IMAGE}"

    echo_status "Local images:"
    docker images | grep -E "${image_names}" || echo "No local images found"

    echo_status "Remote images in Artifact Registry:"
    # Quoted so the repository path always reaches gcloud as one argument.
    gcloud artifacts docker images list "us-docker.pkg.dev/${PROJECT_ID}/${REPOSITORY}" \
        --include-tags --filter="package~(${image_names})" || echo "No remote images found"
}
316+
168317
# Cleanup function
169318
cleanup() {
170319
echo_status "Cleaning up..."
@@ -184,15 +333,27 @@ main() {
184333
build_images
185334
;;
186335
"push")
336+
check_files
187337
push_images
188338
;;
189339
"deploy")
190340
deploy_to_gke
191341
;;
342+
"test-deploy")
343+
check_files
344+
deploy_test
345+
;;
346+
"test")
347+
check_files
348+
run_tests
349+
;;
192350
"info")
193351
get_service_info
194352
;;
195-
"test")
353+
"images")
354+
list_images
355+
;;
356+
"basic-test")
196357
test_deployment
197358
;;
198359
"all")
@@ -204,17 +365,30 @@ main() {
204365
test_deployment
205366
cleanup
206367
;;
368+
"full")
369+
check_files
370+
build_images
371+
push_images
372+
deploy_to_gke
373+
get_service_info
374+
run_tests
375+
cleanup
376+
;;
207377
*)
208-
echo "Usage: $0 {check|build|push|deploy|info|test|all}"
378+
echo "Usage: $0 {check|build|push|deploy|test-deploy|test|info|images|basic-test|all|full}"
209379
echo ""
210380
echo "Commands:"
211-
echo " check - Check if required files exist"
212-
echo " build - Build Docker images"
213-
echo " push - Push images to Artifact Registry"
214-
echo " deploy - Deploy to GKE"
215-
echo " info - Get service information"
216-
echo " test - Test the deployment"
217-
echo " all - Run complete build and deployment process"
381+
echo " check - Check if required files exist"
382+
echo " build - Build Docker images (including test if Dockerfile-test exists)"
383+
echo " push - Push images to Artifact Registry"
384+
echo " deploy - Deploy to GKE"
385+
echo " test-deploy- Deploy test job only"
386+
echo " test - Run comprehensive tests using test image"
387+
echo " info - Get service information"
388+
echo " images - List built images (local and remote)"
389+
echo " basic-test - Run basic connectivity tests"
390+
echo " all - Run complete build and deployment process (no tests)"
391+
echo " full - Run complete process including comprehensive tests"
218392
exit 1
219393
;;
220394
esac

latencypredictor-v1/manifests/dual-server-deployment.yaml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -84,11 +84,11 @@ spec:
8484
component: training
8585
spec:
8686
nodeSelector:
87-
cloud.google.com/gke-nodepool: "pool-1"
87+
cloud.google.com/gke-nodepool: "pool-2"
8888
containers:
8989
- name: training-server
90-
image: us-docker.pkg.dev/kaushikmitra-gke-dev/kaushikmitra-docker-repo/latencypredictor-v1-training-server:latest
91-
90+
image: us-docker.pkg.dev/kaushikmitra-gke-dev/kaushikmitra-docker-repo/latencypredictor-v3-training-server:latest
91+
9292
imagePullPolicy: Always
9393
ports:
9494
- containerPort: 8000
@@ -145,7 +145,7 @@ metadata:
145145
app: prediction-server
146146
component: prediction
147147
spec:
148-
replicas: 5
148+
replicas: 10
149149
selector:
150150
matchLabels:
151151
app: prediction-server
@@ -157,10 +157,10 @@ spec:
157157
component: prediction
158158
spec:
159159
nodeSelector:
160-
cloud.google.com/gke-nodepool: "pool-1"
160+
cloud.google.com/gke-nodepool: "pool-2"
161161
containers:
162162
- name: prediction-server
163-
image: us-docker.pkg.dev/kaushikmitra-gke-dev/kaushikmitra-docker-repo/latencypredictor-v1-prediction-server:latest
163+
image: us-docker.pkg.dev/kaushikmitra-gke-dev/kaushikmitra-docker-repo/latencypredictor-v3-prediction-server:latest
164164
imagePullPolicy: Always
165165
ports:
166166
- containerPort: 8001

0 commit comments

Comments
 (0)