Refactor pipeline to include model training

ebiser · ebiser · commit bd4a23908df5 · 2025-12-31T12:33:21.000-06:00
diff --git a/docker-compose.yaml b/docker-compose.yaml
@@ -1,21 +1,16 @@
 services:
-  # Pod 1: Data Gather - Financial Fraud Data Generator
+  # Pod 1: Data Gather - High-throughput transaction generator
   data-gather:
     build: ./pods/data-gather
     container_name: fraud-detection-gather
-    tty: false
-    stdin_open: false
     volumes:
-      - ${TEMPLATE_MOUNT:-/mnt/datasets/kaggle/creditcardfraud}:/mnt/datasets/kaggle/creditcardfraud:ro
-      - ${FB_OUTPUT_MOUNT:-/mnt/fsaai-shared/ebiser/fraud-data}:/mnt/fsaai-shared/ebiser/fraud-data
+      - /mnt/fsaai-shared/ebiser/fraud-data:/mnt/fsaai-shared/ebiser/fraud-data
     environment:
-      - TEMPLATE_DIR=/mnt/datasets/kaggle/creditcardfraud
-      - TEMPLATE_FILE=creditcard.csv
       - OUTPUT_DIR=/mnt/fsaai-shared/ebiser/fraud-data
       - NUM_WORKERS=${NUM_WORKERS:-128}
       - DURATION_SECONDS=${DURATION_SECONDS:-300}
-      - CHUNK_SIZE=${CHUNK_SIZE:-2000000}
-      - OUTPUT_FORMAT=${OUTPUT_FORMAT:-parquet}
+      - CHUNK_SIZE=${CHUNK_SIZE:-1000000}
+      - FRAUD_RATE=${FRAUD_RATE:-0.005}
     deploy:
       resources:
         limits:
@@ -31,15 +26,12 @@ services:
     networks:
       - fraud-detection
 
-  # Pod 2: Data Prep (Multi-GPU) - RAPIDS Dask-cuDF
+  # Pod 2: Data Prep (Multi-GPU) - RAPIDS feature engineering
   data-prep:
     build: ./pods/data-prep
     container_name: fraud-detection-prep
-    tty: false
-    stdin_open: false
     volumes:
-      - ${FB_OUTPUT_MOUNT:-/mnt/fsaai-shared/ebiser/fraud-data}:/mnt/fsaai-shared/ebiser/fraud-data:ro
-      - ${FB_MOUNT:-/mnt/fsaai-shared/ebiser}:/mnt/fsaai-shared/ebiser
+      - /mnt/fsaai-shared/ebiser:/mnt/fsaai-shared/ebiser
     environment:
       - INPUT_DIR=/mnt/fsaai-shared/ebiser/fraud-data
       - OUTPUT_DIR=/mnt/fsaai-shared/ebiser/prep-output
@@ -60,61 +52,56 @@ services:
     networks:
       - fraud-detection
 
-  # Pod 3: Model Build (GPU Required)
+  # Pod 3: Model Build (GPU) - XGBoost training
   model-build:
     build: ./pods/model-build
     container_name: fraud-detection-build
-    tty: false
-    stdin_open: false
     volumes:
-      - ${FB_MOUNT:-/mnt/fsaai-shared/ebiser}:/mnt/fsaai-shared/ebiser
-      - fa-storage:/root/ebiser/nvidia.financial.fraud.detection
+      - /mnt/fsaai-shared/ebiser:/mnt/fsaai-shared/ebiser
+      - model-output:/workspace/model-output
     environment:
       - FB_MOUNT=/mnt/fsaai-shared/ebiser
-      - FA_MOUNT=/root/ebiser/nvidia.financial.fraud.detection
-      - S3_ENDPOINT=${S3_ENDPOINT}
-      - S3_ACCESS_KEY=${S3_ACCESS_KEY}
-      - S3_SECRET_KEY=${S3_SECRET_KEY}
-      - S3_BUCKET=${S3_BUCKET}
-      - FEATURES_FILE=${FEATURES_FILE}
+      - FA_MOUNT=/workspace/model-output
+      - PREP_OUTPUT_DIR=/mnt/fsaai-shared/ebiser/prep-output
+      - S3_ENDPOINT=${S3_ENDPOINT:-}
+      - S3_ACCESS_KEY=${S3_ACCESS_KEY:-}
+      - S3_SECRET_KEY=${S3_SECRET_KEY:-}
+      - S3_BUCKET=${S3_BUCKET:-}
+      - FEATURES_FILE=${FEATURES_FILE:-}
     deploy:
       resources:
         reservations:
           devices:
             - driver: nvidia
-              count: 2
+              count: 1
               capabilities: [gpu]
     depends_on:
-      - data-prep
+      data-prep:
+        condition: service_completed_successfully
     networks:
       - fraud-detection
 
-  # Pod 4: Inference (GPU Required)
+  # Pod 4: Inference - Triton Server
   inference:
-    build: ./pods/inference
+    image: nvcr.io/nvidia/tritonserver:24.02-py3
     container_name: fraud-detection-inference
-    tty: false
-    stdin_open: false
     ports:
       - "8000:8000"
       - "8001:8001"
       - "8002:8002"
     volumes:
-      - fa-storage:/root/ebiser/nvidia.financial.fraud.detection
-    environment:
-      - FA_MOUNT=/root/ebiser/nvidia.financial.fraud.detection
-      - MODEL_REPOSITORY=/root/ebiser/nvidia.financial.fraud.detection/model_repository
-      - NOTIFICATION_ENDPOINT=http://notification:5000/notify/fraud
+      - model-output:/workspace/model-output:ro
+    command: ["tritonserver", "--model-repository=/workspace/model-output/model_repository", "--strict-model-config=false", "--log-verbose=1"]
     deploy:
       resources:
         reservations:
           devices:
             - driver: nvidia
-              count: 2
+              count: 1
               capabilities: [gpu]
     depends_on:
-      - model-build
-      - notification
+      model-build:
+        condition: service_completed_successfully
     networks:
       - fraud-detection
     healthcheck:
@@ -123,12 +110,10 @@ services:
       timeout: 10s
       retries: 3
 
-  # Pod 5: Notification
+  # Pod 5: Notification - Alert service
   notification:
     build: ./pods/notification
     container_name: fraud-detection-notification
-    tty: false
-    stdin_open: false
     ports:
       - "5000:5000"
     environment:
@@ -148,9 +133,5 @@ networks:
     driver: bridge
 
 volumes:
-  fa-storage:
-    driver: local
-    driver_opts:
-      type: none
-      o: bind
-      device: ${FA_MOUNT:-~/ebiser/nvidia.financial.fraud.detection}
+  model-output:
+    driver: local
diff --git a/pods/data-gather/Dockerfile b/pods/data-gather/Dockerfile
@@ -9,10 +9,11 @@ RUN pip install --no-cache-dir -r requirements.txt
 
 COPY gather.py .
 
+# Default configuration
 ENV OUTPUT_DIR=/mnt/fsaai-shared/ebiser/fraud-data
 ENV NUM_WORKERS=128
 ENV DURATION_SECONDS=300
-ENV CHUNK_SIZE=500000
+ENV CHUNK_SIZE=1000000
 ENV FRAUD_RATE=0.005
 ENV PYTHONUNBUFFERED=1
 
diff --git a/pods/data-prep/Dockerfile b/pods/data-prep/Dockerfile
@@ -8,6 +8,7 @@ ENV DASK_DISTRIBUTED__LOGGING__DISTRIBUTED=critical
 WORKDIR /app
 COPY prepare.py /app/
 
+# Default configuration
 ENV INPUT_DIR=/mnt/fsaai-shared/ebiser/fraud-data
 ENV OUTPUT_DIR=/mnt/fsaai-shared/ebiser/prep-output
 ENV BATCH_MODE=false
@@ -17,5 +18,4 @@ ENV LATEST_ONLY=true
 ENV FILE_STABLE_SECONDS=10
 ENV USE_MULTI_GPU=true
 
-# stderr handled in Python
 CMD ["python", "prepare.py"]
diff --git a/pods/model-build/Dockerfile b/pods/model-build/Dockerfile
@@ -1,34 +1,23 @@
-FROM nvcr.io/nvidia/pytorch:23.10-py3
+FROM nvcr.io/nvidia/rapidsai/base:24.02-cuda12.0-py3.10
 
-LABEL maintainer="your.email@example.com"
-LABEL description="Pod 3: Model Build Service for NVIDIA Fraud Detection Pipeline"
-
-# Set working directory
 WORKDIR /app
 
-# Install RAPIDS and XGBoost
+# Install XGBoost with GPU support and boto3; specifiers are quoted so the shell does not parse ">" as a redirect
 RUN pip install --no-cache-dir \
-    cudf-cu12==23.10.* \
-    cuml-cu12==23.10.* \
-    cugraph-cu12==23.10.* \
-    xgboost==2.0.2 \
-    torch-geometric==2.4.0 \
-    boto3==1.34.34 \
-    pyarrow==14.0.1
+    "xgboost>=2.0.0" \
+    "boto3>=1.34.0"
 
-# Copy application files
 COPY train.py /app/
 
-# Set environment variables
+# Default configuration
 ENV FB_MOUNT=/mnt/fsaai-shared/ebiser
-ENV FA_MOUNT=/root/ebiser/nvidia.financial.fraud.detection
+ENV FA_MOUNT=/workspace/model-output
+ENV PREP_OUTPUT_DIR=/mnt/fsaai-shared/ebiser/prep-output
 ENV S3_ENDPOINT=""
 ENV S3_ACCESS_KEY=""
 ENV S3_SECRET_KEY=""
 ENV S3_BUCKET=""
 ENV FEATURES_FILE=""
 ENV PYTHONUNBUFFERED=1
-ENV CUDA_VISIBLE_DEVICES=0,1
 
-# Run the application
-CMD ["python", "train.py"]
+CMD ["python", "train.py"]
diff --git a/pods/notification/Dockerfile b/pods/notification/Dockerfile
@@ -1,28 +1,21 @@
 FROM python:3.10-slim
 
-LABEL maintainer="your.email@example.com"
-LABEL description="Pod 5: Notification Service for NVIDIA Fraud Detection Pipeline"
+LABEL description="Pod 5: Notification Service for Fraud Detection Pipeline"
 
-# Set working directory
 WORKDIR /app
 
-# Install Python dependencies
 RUN pip install --no-cache-dir \
     flask==3.0.0 \
     gunicorn==21.2.0 \
     requests==2.31.0
 
-# Copy application files
 COPY app.py /app/
 
-# Set environment variables
 ENV HOST=0.0.0.0
 ENV PORT=5000
 ENV DEBUG=False
 ENV PYTHONUNBUFFERED=1
 
-# Expose port
 EXPOSE 5000
 
-# Run with gunicorn for production
-CMD ["gunicorn", "--bind", "0.0.0.0:5000", "--workers", "4", "--timeout", "120", "app:app"]
+CMD ["gunicorn", "--bind", "0.0.0.0:5000", "--workers", "4", "--timeout", "120", "app:app"]
diff --git a/pods/notification/app.py b/pods/notification/app.py
@@ -60,12 +60,6 @@ def notify_fraud():
                       f"Score: {alert['fraud_score']:.4f} - "
                       f"Amount: ${alert['amount']:.2f}")
         
-        # In production, you would:
-        # 1. Send email/SMS notifications
-        # 2. Update fraud case management system
-        # 3. Trigger automated blocking if high confidence
-        # 4. Log to security information and event management (SIEM)
-        
         return jsonify({
             'status': 'success',
             'message': 'Alert received and processed',
@@ -157,4 +151,4 @@ def main():
     app.run(host=host, port=port, debug=debug)
 
 if __name__ == "__main__":
-    main()
+    main()