Skip to content

Commit bd4a239

Browse files
committed
refactoring to include training
1 parent d16eeae commit bd4a239

File tree

6 files changed

+43
-85
lines changed

6 files changed

+43
-85
lines changed

docker-compose.yaml

Lines changed: 29 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,16 @@
11
services:
2-
# Pod 1: Data Gather - Financial Fraud Data Generator
2+
# Pod 1: Data Gather - High-throughput transaction generator
33
data-gather:
44
build: ./pods/data-gather
55
container_name: fraud-detection-gather
6-
tty: false
7-
stdin_open: false
86
volumes:
9-
- ${TEMPLATE_MOUNT:-/mnt/datasets/kaggle/creditcardfraud}:/mnt/datasets/kaggle/creditcardfraud:ro
10-
- ${FB_OUTPUT_MOUNT:-/mnt/fsaai-shared/ebiser/fraud-data}:/mnt/fsaai-shared/ebiser/fraud-data
7+
- /mnt/fsaai-shared/ebiser/fraud-data:/mnt/fsaai-shared/ebiser/fraud-data
118
environment:
12-
- TEMPLATE_DIR=/mnt/datasets/kaggle/creditcardfraud
13-
- TEMPLATE_FILE=creditcard.csv
149
- OUTPUT_DIR=/mnt/fsaai-shared/ebiser/fraud-data
1510
- NUM_WORKERS=${NUM_WORKERS:-128}
1611
- DURATION_SECONDS=${DURATION_SECONDS:-300}
17-
- CHUNK_SIZE=${CHUNK_SIZE:-2000000}
18-
- OUTPUT_FORMAT=${OUTPUT_FORMAT:-parquet}
12+
- CHUNK_SIZE=${CHUNK_SIZE:-1000000}
13+
- FRAUD_RATE=${FRAUD_RATE:-0.005}
1914
deploy:
2015
resources:
2116
limits:
@@ -31,15 +26,12 @@ services:
3126
networks:
3227
- fraud-detection
3328

34-
# Pod 2: Data Prep (Multi-GPU) - RAPIDS Dask-cuDF
29+
# Pod 2: Data Prep (Multi-GPU) - RAPIDS feature engineering
3530
data-prep:
3631
build: ./pods/data-prep
3732
container_name: fraud-detection-prep
38-
tty: false
39-
stdin_open: false
4033
volumes:
41-
- ${FB_OUTPUT_MOUNT:-/mnt/fsaai-shared/ebiser/fraud-data}:/mnt/fsaai-shared/ebiser/fraud-data:ro
42-
- ${FB_MOUNT:-/mnt/fsaai-shared/ebiser}:/mnt/fsaai-shared/ebiser
34+
- /mnt/fsaai-shared/ebiser:/mnt/fsaai-shared/ebiser
4335
environment:
4436
- INPUT_DIR=/mnt/fsaai-shared/ebiser/fraud-data
4537
- OUTPUT_DIR=/mnt/fsaai-shared/ebiser/prep-output
@@ -60,61 +52,56 @@ services:
6052
networks:
6153
- fraud-detection
6254

63-
# Pod 3: Model Build (GPU Required)
55+
# Pod 3: Model Build (GPU) - XGBoost training
6456
model-build:
6557
build: ./pods/model-build
6658
container_name: fraud-detection-build
67-
tty: false
68-
stdin_open: false
6959
volumes:
70-
- ${FB_MOUNT:-/mnt/fsaai-shared/ebiser}:/mnt/fsaai-shared/ebiser
71-
- fa-storage:/root/ebiser/nvidia.financial.fraud.detection
60+
- /mnt/fsaai-shared/ebiser:/mnt/fsaai-shared/ebiser
61+
- model-output:/workspace/model-output
7262
environment:
7363
- FB_MOUNT=/mnt/fsaai-shared/ebiser
74-
- FA_MOUNT=/root/ebiser/nvidia.financial.fraud.detection
75-
- S3_ENDPOINT=${S3_ENDPOINT}
76-
- S3_ACCESS_KEY=${S3_ACCESS_KEY}
77-
- S3_SECRET_KEY=${S3_SECRET_KEY}
78-
- S3_BUCKET=${S3_BUCKET}
79-
- FEATURES_FILE=${FEATURES_FILE}
64+
- FA_MOUNT=/workspace/model-output
65+
- PREP_OUTPUT_DIR=/mnt/fsaai-shared/ebiser/prep-output
66+
- S3_ENDPOINT=${S3_ENDPOINT:-}
67+
- S3_ACCESS_KEY=${S3_ACCESS_KEY:-}
68+
- S3_SECRET_KEY=${S3_SECRET_KEY:-}
69+
- S3_BUCKET=${S3_BUCKET:-}
70+
- FEATURES_FILE=${FEATURES_FILE:-}
8071
deploy:
8172
resources:
8273
reservations:
8374
devices:
8475
- driver: nvidia
85-
count: 2
76+
count: 1
8677
capabilities: [gpu]
8778
depends_on:
88-
- data-prep
79+
data-prep:
80+
condition: service_completed_successfully
8981
networks:
9082
- fraud-detection
9183

92-
# Pod 4: Inference (GPU Required)
84+
# Pod 4: Inference - Triton Server
9385
inference:
94-
build: ./pods/inference
86+
image: nvcr.io/nvidia/tritonserver:24.02-py3
9587
container_name: fraud-detection-inference
96-
tty: false
97-
stdin_open: false
9888
ports:
9989
- "8000:8000"
10090
- "8001:8001"
10191
- "8002:8002"
10292
volumes:
103-
- fa-storage:/root/ebiser/nvidia.financial.fraud.detection
104-
environment:
105-
- FA_MOUNT=/root/ebiser/nvidia.financial.fraud.detection
106-
- MODEL_REPOSITORY=/root/ebiser/nvidia.financial.fraud.detection/model_repository
107-
- NOTIFICATION_ENDPOINT=http://notification:5000/notify/fraud
93+
- model-output:/workspace/model-output:ro
94+
command: ["tritonserver", "--model-repository=/workspace/model-output/model_repository", "--strict-model-config=false", "--log-verbose=1"]
10895
deploy:
10996
resources:
11097
reservations:
11198
devices:
11299
- driver: nvidia
113-
count: 2
100+
count: 1
114101
capabilities: [gpu]
115102
depends_on:
116-
- model-build
117-
- notification
103+
model-build:
104+
condition: service_completed_successfully
118105
networks:
119106
- fraud-detection
120107
healthcheck:
@@ -123,12 +110,10 @@ services:
123110
timeout: 10s
124111
retries: 3
125112

126-
# Pod 5: Notification
113+
# Pod 5: Notification - Alert service
127114
notification:
128115
build: ./pods/notification
129116
container_name: fraud-detection-notification
130-
tty: false
131-
stdin_open: false
132117
ports:
133118
- "5000:5000"
134119
environment:
@@ -148,9 +133,5 @@ networks:
148133
driver: bridge
149134

150135
volumes:
151-
fa-storage:
152-
driver: local
153-
driver_opts:
154-
type: none
155-
o: bind
156-
device: ${FA_MOUNT:-~/ebiser/nvidia.financial.fraud.detection}
136+
model-output:
137+
driver: local

pods/data-gather/Dockerfile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,11 @@ RUN pip install --no-cache-dir -r requirements.txt
99

1010
COPY gather.py .
1111

12+
# Default configuration
1213
ENV OUTPUT_DIR=/mnt/fsaai-shared/ebiser/fraud-data
1314
ENV NUM_WORKERS=128
1415
ENV DURATION_SECONDS=300
15-
ENV CHUNK_SIZE=500000
16+
ENV CHUNK_SIZE=1000000
1617
ENV FRAUD_RATE=0.005
1718
ENV PYTHONUNBUFFERED=1
1819

pods/data-prep/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ ENV DASK_DISTRIBUTED__LOGGING__DISTRIBUTED=critical
88
WORKDIR /app
99
COPY prepare.py /app/
1010

11+
# Default configuration
1112
ENV INPUT_DIR=/mnt/fsaai-shared/ebiser/fraud-data
1213
ENV OUTPUT_DIR=/mnt/fsaai-shared/ebiser/prep-output
1314
ENV BATCH_MODE=false
@@ -17,5 +18,4 @@ ENV LATEST_ONLY=true
1718
ENV FILE_STABLE_SECONDS=10
1819
ENV USE_MULTI_GPU=true
1920

20-
# stderr handled in Python
2121
CMD ["python", "prepare.py"]

pods/model-build/Dockerfile

Lines changed: 8 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,23 @@
1-
FROM nvcr.io/nvidia/pytorch:23.10-py3
1+
FROM nvcr.io/nvidia/rapidsai/base:24.02-cuda12.0-py3.10
22

3-
LABEL maintainer="your.email@example.com"
4-
LABEL description="Pod 3: Model Build Service for NVIDIA Fraud Detection Pipeline"
5-
6-
# Set working directory
73
WORKDIR /app
84

9-
# Install RAPIDS and XGBoost
5+
# Install XGBoost with GPU support and boto3
106
RUN pip install --no-cache-dir \
11-
cudf-cu12==23.10.* \
12-
cuml-cu12==23.10.* \
13-
cugraph-cu12==23.10.* \
14-
xgboost==2.0.2 \
15-
torch-geometric==2.4.0 \
16-
boto3==1.34.34 \
17-
pyarrow==14.0.1
7+
xgboost>=2.0.0 \
8+
boto3>=1.34.0
189

19-
# Copy application files
2010
COPY train.py /app/
2111

22-
# Set environment variables
12+
# Default configuration
2313
ENV FB_MOUNT=/mnt/fsaai-shared/ebiser
24-
ENV FA_MOUNT=/root/ebiser/nvidia.financial.fraud.detection
14+
ENV FA_MOUNT=/workspace/model-output
15+
ENV PREP_OUTPUT_DIR=/mnt/fsaai-shared/ebiser/prep-output
2516
ENV S3_ENDPOINT=""
2617
ENV S3_ACCESS_KEY=""
2718
ENV S3_SECRET_KEY=""
2819
ENV S3_BUCKET=""
2920
ENV FEATURES_FILE=""
3021
ENV PYTHONUNBUFFERED=1
31-
ENV CUDA_VISIBLE_DEVICES=0,1
3222

33-
# Run the application
34-
CMD ["python", "train.py"]
23+
CMD ["python", "train.py"]

pods/notification/Dockerfile

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,21 @@
11
FROM python:3.10-slim
22

3-
LABEL maintainer="your.email@example.com"
4-
LABEL description="Pod 5: Notification Service for NVIDIA Fraud Detection Pipeline"
3+
LABEL description="Pod 5: Notification Service for Fraud Detection Pipeline"
54

6-
# Set working directory
75
WORKDIR /app
86

9-
# Install Python dependencies
107
RUN pip install --no-cache-dir \
118
flask==3.0.0 \
129
gunicorn==21.2.0 \
1310
requests==2.31.0
1411

15-
# Copy application files
1612
COPY app.py /app/
1713

18-
# Set environment variables
1914
ENV HOST=0.0.0.0
2015
ENV PORT=5000
2116
ENV DEBUG=False
2217
ENV PYTHONUNBUFFERED=1
2318

24-
# Expose port
2519
EXPOSE 5000
2620

27-
# Run with gunicorn for production
28-
CMD ["gunicorn", "--bind", "0.0.0.0:5000", "--workers", "4", "--timeout", "120", "app:app"]
21+
CMD ["gunicorn", "--bind", "0.0.0.0:5000", "--workers", "4", "--timeout", "120", "app:app"]

pods/notification/app.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -60,12 +60,6 @@ def notify_fraud():
6060
f"Score: {alert['fraud_score']:.4f} - "
6161
f"Amount: ${alert['amount']:.2f}")
6262

63-
# In production, you would:
64-
# 1. Send email/SMS notifications
65-
# 2. Update fraud case management system
66-
# 3. Trigger automated blocking if high confidence
67-
# 4. Log to security information and event management (SIEM)
68-
6963
return jsonify({
7064
'status': 'success',
7165
'message': 'Alert received and processed',
@@ -157,4 +151,4 @@ def main():
157151
app.run(host=host, port=port, debug=debug)
158152

159153
if __name__ == "__main__":
160-
main()
154+
main()

0 commit comments

Comments
 (0)