MLOps_PipeLine/production_app.py at main · Abeshith/MLOps_PipeLine · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
from flask import Flask, render_template, request, jsonify
import pandas as pd
import numpy as np
import json
import os
import pickle
import joblib
import time
import logging
import requests
from datetime import datetime
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer

# Prometheus metrics
from prometheus_client import Counter, Histogram, Gauge, Info, generate_latest, CONTENT_TYPE_LATEST

# Logging setup for Elasticsearch
import logging.handlers
import sys

app = Flask(__name__)

# Configure structured logging for ELK stack
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('/tmp/ml-pipeline.log')
    ]
)
logger = logging.getLogger('ml-pipeline')

# Prometheus Metrics - Complete ML Pipeline Monitoring
prediction_counter = Counter('ml_predictions_total', 'Total predictions made')
model_accuracy = Gauge('ml_model_accuracy', 'Current model accuracy')
model_precision = Gauge('ml_model_precision', 'Model precision')
model_recall = Gauge('ml_model_recall', 'Model recall')
model_f1_score = Gauge('ml_model_f1_score', 'Model F1 score')
request_duration = Histogram('http_request_duration_seconds', 'Request duration')
prediction_confidence = Histogram('ml_prediction_confidence', 'Prediction confidence scores')
predictions_by_class = Counter('ml_predictions_by_class', 'Predictions by class', ['prediction_class'])
error_rate = Gauge('ml_prediction_error_rate', 'Prediction error rate')
model_version = Info('ml_model_version', 'Current model version')
feature_processing_time = Histogram('ml_feature_processing_seconds', 'Feature processing time')
input_validation_failures = Counter('ml_input_validation_failures', 'Invalid input requests')
model_load_time = Histogram('ml_model_load_seconds', 'Model loading time')
memory_usage = Gauge('ml_app_memory_bytes', 'Memory usage')
active_users = Gauge('ml_active_users', 'Active users')

# Initialize metrics with actual model performance
model_accuracy.set(0.9215)
model_precision.set(0.9147)
model_recall.set(0.9200)
model_f1_score.set(0.9164)
error_rate.set(0.0785)
model_version.info({'version': 'xgboost_v1.0', 'training_date': '2024-10-17', 'accuracy': '92.15%'})

# Global counters for monitoring
total_requests = 0
failed_requests = 0
model_loaded = False

def send_log_to_elasticsearch(level, message, extra_data=None):
    """Send structured logs to Elasticsearch"""
    try:
        log_data = {
            '@timestamp': datetime.utcnow().isoformat() + 'Z',
            'level': level,
            'service': 'ml-pipeline',
            'message': message,
            'host': 'localhost',
            'environment': 'production'
        }
        if extra_data:
            log_data.update(extra_data)

        # Send to Elasticsearch
        requests.post(
            'http://localhost:9200/ml-pipeline-logs/_doc',
            json=log_data,
            timeout=1
        )
    except:
        pass  # Don't fail if logging fails

def load_model_with_monitoring():
    """Load model with performance monitoring"""
    global model_loaded
    start_time = time.time()

    try:
        logger.info("Starting model loading process")
        send_log_to_elasticsearch('INFO', 'Model loading started')

        if not os.path.exists('artifacts/model_trainer/model.pkl'):
            logger.error("Model file not found")
            send_log_to_elasticsearch('ERROR', 'Model file not found', {'error_code': 'MODEL_NOT_FOUND'})
            return None

        model = joblib.load('artifacts/model_trainer/model.pkl')

        # Fix XGBoost compatibility
        if hasattr(model, 'use_label_encoder'):
            model.use_label_encoder = False

        load_duration = time.time() - start_time
        model_load_time.observe(load_duration)
        model_loaded = True

        logger.info(f"Model loaded successfully in {load_duration:.3f}s")
        send_log_to_elasticsearch('INFO', 'Model loaded successfully', {
            'model_type': model.__class__.__name__,
            'load_duration': load_duration
        })

        return model
    except Exception as e:
        load_duration = time.time() - start_time
        model_load_time.observe(load_duration)
        logger.error(f"Model loading failed: {str(e)}")
        send_log_to_elasticsearch('ERROR', 'Model loading failed', {
            'error': str(e),
            'load_duration': load_duration
        })
        return None

# Load model at startup
model = load_model_with_monitoring()

@app.route('/')
def index():
    """Main UI page"""
    active_users.inc()
    logger.info("Home page accessed")
    send_log_to_elasticsearch('INFO', 'Home page accessed', {'endpoint': 'home'})
    return render_template('index.html')

@app.route('/predict', methods=['POST'])
def predict():
    """ML Prediction endpoint with comprehensive monitoring"""
    global total_requests, failed_requests
    start_time = time.time()
    feature_start = time.time()

    total_requests += 1
    request_id = f"req_{int(time.time())}_{total_requests}"

    logger.info(f"Prediction request started: {request_id}")
    send_log_to_elasticsearch('INFO', 'Prediction request started', {
        'request_id': request_id,
        'endpoint': 'predict'
    })

    try:
        if not model_loaded or model is None:
            failed_requests += 1
            input_validation_failures.inc()
            error_rate.set(failed_requests / total_requests)

            logger.error("Model not available")
            send_log_to_elasticsearch('ERROR', 'Model not available', {
                'request_id': request_id,
                'error_code': 'MODEL_NOT_AVAILABLE'
            })
            return jsonify({'error': 'Model not available. Please check model loading.'}), 500

        # Get and validate input data
        data = request.json
        if not data:
            failed_requests += 1
            input_validation_failures.inc()
            error_rate.set(failed_requests / total_requests)

            logger.error("No input data provided")
            send_log_to_elasticsearch('ERROR', 'No input data provided', {
                'request_id': request_id,
                'error_code': 'NO_INPUT_DATA'
            })
            return jsonify({'error': 'No input data provided'}), 400

        # Validate required fields
        required_fields = ['age', 'job', 'marital', 'education', 'housing', 'loan', 'duration', 'campaign']
        missing_fields = [field for field in required_fields if field not in data or data[field] == '']

        if missing_fields:
            failed_requests += 1
            input_validation_failures.inc()
            error_rate.set(failed_requests / total_requests)

            logger.error(f"Missing required fields: {missing_fields}")
            send_log_to_elasticsearch('ERROR', 'Missing required fields', {
                'request_id': request_id,
                'missing_fields': missing_fields,
                'error_code': 'MISSING_FIELDS'
            })
            return jsonify({'error': f'Missing required fields: {missing_fields}'}), 400

        # Feature processing with monitoring
        try:
            # Convert and validate numeric fields
            data['age'] = int(data['age'])
            data['duration'] = int(data['duration'])
            data['campaign'] = int(data['campaign'])
            data['balance'] = int(data.get('balance', 1500))
            data['day'] = 15
            data['pdays'] = -1
            data['previous'] = 0
        except ValueError as e:
            failed_requests += 1
            input_validation_failures.inc()
            error_rate.set(failed_requests / total_requests)

            logger.error(f"Invalid numeric values: {str(e)}")
            send_log_to_elasticsearch('ERROR', 'Invalid numeric values', {
                'request_id': request_id,
                'error': str(e),
                'error_code': 'INVALID_NUMERIC_VALUES'
            })
            return jsonify({'error': 'Invalid numeric values provided'}), 400

        # Add default values
        data.update({
            'id': 999999,
            'default': 'no',
            'contact': 'cellular',
            'month': 'may',
            'poutcome': 'unknown'
        })

        feature_duration = time.time() - feature_start
        feature_processing_time.observe(feature_duration)

        # Create feature vector (simplified for demo)
        try:
            # Load sample transformed data structure
            if os.path.exists('artifacts/data_transformation/train.csv'):
                transformed_sample = pd.read_csv('artifacts/data_transformation/train.csv', nrows=1)
                feature_names = [col for col in transformed_sample.columns if col != 'y']

                # Create feature vector
                feature_vector = pd.DataFrame(0, index=[0], columns=feature_names)

                # Set normalized features (simplified)
                feature_vector.loc[0, 'age'] = (data['age'] - 40) / 10
                feature_vector.loc[0, 'balance'] = (data['balance'] - 1000) / 2000
                feature_vector.loc[0, 'duration'] = (data['duration'] - 250) / 200
                feature_vector.loc[0, 'campaign'] = (data['campaign'] - 2) / 2

                # Set categorical features
                if data['housing'] == 'yes':
                    feature_vector.loc[0, 'housing_yes'] = 1

                processed_data = feature_vector.values
            else:
                # Fallback: create simple feature vector
                processed_data = np.array([[
                    data['age'], data['balance'], data['duration'],
                    data['campaign'], 1 if data['housing'] == 'yes' else 0
                ]])

            # Make prediction
            prediction = model.predict(processed_data)

            # Get prediction probability
            try:
                prediction_proba = model.predict_proba(processed_data)
                if len(prediction_proba.shape) > 1 and prediction_proba.shape[1] > 1:
                    prob_positive = float(prediction_proba[0][1])
                else:
                    prob_positive = float(prediction_proba[0])
            except:
                prob_positive = 0.5  # Fallback

            confidence = max(prob_positive, 1 - prob_positive)

            # Update metrics
            prediction_counter.inc()
            prediction_confidence.observe(confidence)
            predictions_by_class.labels(prediction_class=str(prediction[0])).inc()

            duration = time.time() - start_time
            request_duration.observe(duration)
            error_rate.set(failed_requests / total_requests)

            # Log successful prediction
            logger.info(f"Prediction completed successfully: {request_id}")
            send_log_to_elasticsearch('INFO', 'Prediction completed successfully', {
                'request_id': request_id,
                'prediction': int(prediction[0]),
                'confidence': float(confidence),
                'duration': duration,
                'feature_processing_time': feature_duration,
                'model_type': model.__class__.__name__
            })

            return jsonify({
                'prediction': int(prediction[0]),
                'probability': prob_positive,
                'confidence': float(confidence),
                'confidence_level': 'High' if confidence > 0.8 else 'Medium' if confidence > 0.6 else 'Low',
                'result': 'Will Subscribe to Term Deposit' if prediction[0] == 1 else 'Will Not Subscribe to Term Deposit',
                'duration': round(duration, 3),
                'request_id': request_id,
                'timestamp': datetime.now().isoformat()
            })

        except Exception as e:
            failed_requests += 1
            duration = time.time() - start_time
            request_duration.observe(duration)
            error_rate.set(failed_requests / total_requests)

            logger.error(f"Prediction processing failed: {str(e)}")
            send_log_to_elasticsearch('ERROR', 'Prediction processing failed', {
                'request_id': request_id,
                'error': str(e),
                'duration': duration,
                'error_code': 'PREDICTION_PROCESSING_ERROR'
            })
            return jsonify({'error': f'Prediction processing failed: {str(e)}'}), 500

    except Exception as e:
        failed_requests += 1
        duration = time.time() - start_time
        request_duration.observe(duration)
        error_rate.set(failed_requests / total_requests)

        logger.error(f"Unexpected error in prediction: {str(e)}")
        send_log_to_elasticsearch('ERROR', 'Unexpected prediction error', {
            'request_id': request_id,
            'error': str(e),
            'duration': duration,
            'error_code': 'UNEXPECTED_ERROR'
        })
        return jsonify({'error': f'Unexpected error: {str(e)}'}), 500

@app.route('/health')
def health():
    """Comprehensive health check endpoint"""
    health_status = {
        'status': 'healthy' if model_loaded else 'degraded',
        'service': 'ml-pipeline-production',
        'timestamp': datetime.now().isoformat(),
        'model_loaded': model_loaded,
        'total_predictions': total_requests,
        'error_rate': round(failed_requests / total_requests if total_requests > 0 else 0, 3),
        'model_accuracy': 0.9215,
        'version': '1.0.0',
        'uptime': time.time()
    }

    logger.info("Health check performed")
    send_log_to_elasticsearch('INFO', 'Health check performed', health_status)

    return jsonify(health_status)

@app.route('/metrics')
def metrics():
    """Prometheus metrics endpoint"""
    # Update memory usage
    try:
        import psutil
        process = psutil.Process()
        memory_usage.set(process.memory_info().rss)
    except:
        pass

    return generate_latest(), 200, {'Content-Type': CONTENT_TYPE_LATEST}

@app.route('/status')
def status():
    """Detailed system status for monitoring"""
    status_info = {
        'service': 'ml-pipeline-production',
        'version': '1.0.0',
        'model': {
            'loaded': model_loaded,
            'type': model.__class__.__name__ if model else None,
            'accuracy': 0.9215,
            'version': 'xgboost_v1.0'
        },
        'metrics': {
            'total_predictions': total_requests,
            'failed_predictions': failed_requests,
            'error_rate': round(failed_requests / total_requests if total_requests > 0 else 0, 3),
            'success_rate': round((total_requests - failed_requests) / total_requests if total_requests > 0 else 1, 3)
        },
        'observability': {
            'prometheus_enabled': True,
            'elasticsearch_logging': True,
            'structured_logging': True,
            'health_monitoring': True
        },
        'timestamp': datetime.now().isoformat()
    }

    return jsonify(status_info)

if __name__ == '__main__':
    logger.info("Starting ML Pipeline Production Application")
    send_log_to_elasticsearch('INFO', 'Application starting', {
        'version': '1.0.0',
        'model_loaded': model_loaded,
        'observability_enabled': True
    })

    app.run(host='0.0.0.0', port=5000, debug=False)