diff --git a/Job Satisfaction Analysis/app.py b/Job Satisfaction Analysis/app.py
index 5c118a7..19fbd1a 100644
--- a/Job Satisfaction Analysis/app.py
+++ b/Job Satisfaction Analysis/app.py
@@ -1,9 +1,7 @@
 import streamlit as st
 import joblib
 import pandas as pd
-import matplotlib.pyplot as plt
-import seaborn as sns
-from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc
+from sklearn.preprocessing import LabelEncoder
 
 # Load the model
 model = joblib.load('model.pkl')
@@ -12,6 +10,26 @@
 features = ['Hobby', 'OpenSource', 'Country', 'Student', 'Employment', 'FormalEducation',
             'UndergradMajor', 'CompanySize', 'DevType', 'YearsCoding', 'YearsCodingProf']
 
+# Initialize label encoders for categorical features
+encoders = {
+    'Hobby': LabelEncoder().fit(['Yes', 'No']),
+    'OpenSource': LabelEncoder().fit(['Yes', 'No']),
+    'Country': LabelEncoder().fit(['United States', 'India', 'Germany']),
+    'Student': LabelEncoder().fit(['Yes', 'No']),
+    'Employment': LabelEncoder().fit(['Employed full-time', 'Employed part-time', 'Self-employed', 'Unemployed']),
+    'FormalEducation': LabelEncoder().fit(["Bachelor’s degree (BA, BS, B.Eng., etc.)",
+                                           "Master’s degree (MA, MS, M.Eng., MBA, etc.)",
+                                           "Doctoral degree (PhD)"]),
+    'UndergradMajor': LabelEncoder().fit(["Computer science, computer engineering, or software engineering",
+                                          "Information technology, networking, or system administration",
+                                          "Other engineering discipline"]),
+    'CompanySize': LabelEncoder().fit(['Fewer than 10 employees', '10 to 19 employees', '20 to 99 employees',
+                                       '100 to 499 employees', '500 to 999 employees', '1,000 to 4,999 employees']),
+    'DevType': LabelEncoder().fit(['Developer, back-end', 'Developer, front-end', 'Developer, full-stack']),
+    'YearsCoding': LabelEncoder().fit(['0-2 years', '3-5 years', '6-8 years', '9-11 years']),
+    'YearsCodingProf': LabelEncoder().fit(['0-2 years', '3-5 years', '6-8 years', '9-11 years']),
+}
+
 st.title('Job Satisfaction Prediction')
 
 # Create a form for user input
@@ -54,6 +72,10 @@
     # Convert user input to DataFrame
     input_df = pd.DataFrame([input_data])
 
+    # Encode categorical features
+    for feature in features:
+        input_df[feature] = encoders[feature].transform(input_df[feature])
+
     # Ensure the input has the same columns as the training data
     input_df = input_df[features]
 
@@ -62,47 +84,3 @@
 
     # Display the prediction
     st.write(f'Predicted Job Satisfaction: {prediction[0]}')
-
-    # Evaluate the model on test data (assuming y_test and y_pred are available)
-    # This part would typically be done during model development, not in the prediction app
-    # However, for demonstration purposes, we can create some dummy data
-    y_test = [1, 0, 1, 1, 0]  # Example true labels
-    y_pred = model.predict(input_df)  # Example predicted labels
-
-    # Print accuracy
-    accuracy = accuracy_score(y_test, y_pred)
-    st.write(f'Accuracy: {accuracy:.2f}')
-
-    # Print classification report
-    report = classification_report(y_test, y_pred, output_dict=True)
-    st.write('Classification Report:')
-    st.write(report)
-
-    # Convert classification report to a DataFrame for better readability
-    report_df = pd.DataFrame(report).transpose()
-    st.write(report_df)
-
-    # Plot confusion matrix
-    cm = confusion_matrix(y_test, y_pred)
-    plt.figure(figsize=(10, 6))
-    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
-    plt.title('Confusion Matrix')
-    plt.xlabel('Predicted')
-    plt.ylabel('Actual')
-    st.pyplot(plt)
-
-    # If the model is a binary classifier, plot the ROC curve
-    if len(set(y_test)) == 2:
-        fpr, tpr, _ = roc_curve(y_test, y_pred)
-        roc_auc = auc(fpr, tpr)
-
-        plt.figure(figsize=(10, 6))
-        plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
-        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
-        plt.xlim([0.0, 1.0])
-        plt.ylim([0.0, 1.05])
-        plt.xlabel('False Positive Rate')
-        plt.ylabel('True Positive Rate')
-        plt.title('Receiver Operating Characteristic (ROC) Curve')
-        plt.legend(loc='lower right')
-        st.pyplot(plt)
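Note on the new `encoders` dict: `LabelEncoder.fit()` stores its classes in sorted order, and `transform()` raises a `ValueError` for any string not in that list, so the form must only ever submit the exact values hard-coded above. The form body is not part of this diff; the sketch below is illustrative only, and the `with st.form(...)` / `st.form_submit_button` layout and the `submitted` name are assumptions about code not shown here. It builds the selectbox options directly from `encoders[feature].classes_` so the widgets and the encoders can never drift apart.

```python
# Illustrative sketch (not part of this change): derive the dropdown options
# from the fitted encoders so every submitted value is guaranteed to be
# accepted by encoders[feature].transform(). Form layout is assumed.
with st.form('prediction_form'):
    input_data = {
        feature: st.selectbox(feature, options=list(encoders[feature].classes_))
        for feature in features
    }
    submitted = st.form_submit_button('Predict')
```

A further caveat: if the model was trained on integer codes produced by a different mapping, these re-fitted encoders would silently disagree with it. Persisting the training-time encoders (e.g. `joblib.dump(encoders, 'encoders.pkl')` during training and loading that file here next to `model.pkl`) would be a safer alternative, assuming such a file can be produced from the training pipeline.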