|
| 1 | +# App created by Data Professor http://youtube.com/dataprofessor |
| 2 | +# GitHub repo of this app https://github.com/dataprofessor/ml-auto-app |
| 3 | +# Demo of this app https://share.streamlit.io/dataprofessor/ml-auto-app/main/app.py |
| 4 | + |
| 5 | +import streamlit as st |
| 6 | +import pandas as pd |
| 7 | +from lazypredict.Supervised import LazyRegressor |
| 8 | +from sklearn.model_selection import train_test_split |
| 9 | +from sklearn.ensemble import RandomForestRegressor |
| 10 | +from sklearn.metrics import mean_squared_error, r2_score |
| 11 | +from sklearn.datasets import load_diabetes, load_boston |
| 12 | +import matplotlib.pyplot as plt |
| 13 | +import seaborn as sns |
| 14 | +import base64 |
| 15 | +import io |
| 16 | +#---------------------------------# |
# Page layout
## The 'wide' layout lets the page expand to the full browser width.
st.set_page_config(
    layout='wide',
    page_title='The Machine Learning Algorithm Comparison App',
)
| 21 | +#---------------------------------# |
| 22 | +# Model building |
def _render_metric_plots(scores, metric, heading, file_stem,
                         lower=None, upper=None, unit_axis=False):
    """Render one metric section of the report.

    Emits, in order: the markdown heading, a download link for the tall
    (3x9) bar chart, the wide (9x3) bar chart itself, and a download link
    for the wide chart. The tall chart is only offered as a download.

    Args:
        scores: Per-model metrics table; its index supplies the bar labels.
        metric: Name of the column in ``scores`` to plot.
        heading: Markdown text shown above the section.
        file_stem: Prefix of the PDF file names ('-tall.pdf' / '-wide.pdf'
            is appended).
        lower: Optional lower bound; smaller values are raised to it.
        upper: Optional upper bound; larger values are lowered to it.
        unit_axis: When true, pin the metric axis to the range 0..1.
    """
    # st.markdown returns an element, not a container, so it is called
    # directly rather than used as the target of a `with` statement.
    st.markdown(heading)

    # Plot a clipped copy so the table shown to the user is left untouched.
    clipped = scores.copy()
    clipped[metric] = clipped[metric].clip(lower=lower, upper=upper)

    # Tall variant: models on the y axis.
    plt.figure(figsize=(3, 9))
    sns.set_theme(style="whitegrid")
    ax = sns.barplot(y=clipped.index, x=metric, data=clipped)
    if unit_axis:
        ax.set(xlim=(0, 1))
    # imagedownload saves the current figure and closes it.
    st.markdown(imagedownload(plt, f'{file_stem}-tall.pdf'), unsafe_allow_html=True)

    # Wide variant: models on the x axis, labels rotated to stay readable.
    plt.figure(figsize=(9, 3))
    sns.set_theme(style="whitegrid")
    ax = sns.barplot(x=clipped.index, y=metric, data=clipped)
    if unit_axis:
        ax.set(ylim=(0, 1))
    plt.xticks(rotation=90)
    st.pyplot(plt)
    st.markdown(imagedownload(plt, f'{file_stem}-wide.pdf'), unsafe_allow_html=True)


def build_model(df):
    """Benchmark regressors on ``df`` with LazyRegressor and render the report.

    The last column of ``df`` is used as the target and every other column
    as a feature. The function reads the module-level ``split_size``
    (training-set percentage) and ``seed_number`` values set by the sidebar
    widgets, and writes all of its output through Streamlit.

    Args:
        df: Input table; features first, target in the last column.
    """
    df = df.loc[:100]  # FOR TESTING PURPOSE, COMMENT THIS OUT FOR PRODUCTION
    X = df.iloc[:, :-1]  # Using all column except for the last column as X
    Y = df.iloc[:, -1]   # Selecting the last column as Y

    st.markdown('**1.2. Dataset dimension**')
    st.write('X')
    st.info(X.shape)
    st.write('Y')
    st.info(Y.shape)

    st.markdown('**1.3. Variable details**:')
    st.write('X variable (first 20 are shown)')
    st.info(list(X.columns[:20]))
    st.write('Y variable')
    st.info(Y.name)

    # Build lazy model.
    # split_size is the percentage kept for TRAINING; train_test_split
    # expects the TEST share as a fraction (an int would be read as an
    # absolute number of test samples).
    test_fraction = (100 - split_size) / 100
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_fraction, random_state=seed_number)
    reg = LazyRegressor(verbose=0, ignore_warnings=False, custom_metric=None)
    # The second return value is the table used below for display and plots.
    _, scores_train = reg.fit(X_train, X_train, Y_train, Y_train)
    _, scores_test = reg.fit(X_train, X_test, Y_train, Y_test)

    st.subheader('2. Table of Model Performance')

    # The model names live in the index and filedownload writes the CSV
    # without the index, so move them into a regular column for export.
    st.write('Training set')
    st.write(scores_train)
    st.markdown(filedownload(scores_train.reset_index(), 'training.csv'),
                unsafe_allow_html=True)

    st.write('Test set')
    st.write(scores_test)
    st.markdown(filedownload(scores_test.reset_index(), 'test.csv'),
                unsafe_allow_html=True)

    st.subheader('3. Plot of Model Performance (Test set)')

    # Negative R-squared values are shown as 0 on a fixed 0..1 axis.
    _render_metric_plots(scores_test, "R-Squared", '**R-squared**',
                         'plot-r2', lower=0, unit_axis=True)
    _render_metric_plots(scores_test, "RMSE", '**RMSE (capped at 50)**',
                         'plot-rmse', upper=50)
    _render_metric_plots(scores_test, "Time Taken", '**Calculation time**',
                         'plot-calculation-time', lower=0)
| 105 | + |
# Download CSV data
# https://discuss.streamlit.io/t/how-to-download-file-in-streamlit/1806
def filedownload(df, filename):
    """Return an HTML anchor that downloads ``df`` as a CSV file.

    The CSV text (written without the index) is base64-encoded and embedded
    in a ``data:`` URI, so no server-side file is needed.

    Args:
        df: Table to export; must provide ``to_csv``.
        filename: Name suggested to the browser for the downloaded file.

    Returns:
        The ``<a>`` element as a string, meant to be rendered with
        ``unsafe_allow_html=True``.
    """
    import html  # local import keeps this block self-contained

    csv_text = df.to_csv(index=False)
    b64 = base64.b64encode(csv_text.encode()).decode()  # str -> bytes -> base64 str
    # Escape and quote the name so spaces or quotes cannot break the markup.
    safe_name = html.escape(filename, quote=True)
    return (
        f'<a href="data:text/csv;base64,{b64}" download="{safe_name}">'
        f'Download {safe_name} File</a>'
    )
| 113 | + |
def imagedownload(plt, filename):
    """Return an HTML anchor that downloads the current figure as a PDF.

    The figure is saved to an in-memory buffer through ``plt.savefig`` and
    then closed through ``plt.close``; the bytes are base64-encoded into a
    ``data:`` URI.

    Args:
        plt: Object providing ``savefig(buffer, format=..., bbox_inches=...)``
            and ``close()``; the caller passes ``matplotlib.pyplot``.
        filename: Name suggested to the browser for the downloaded file.

    Returns:
        The ``<a>`` element as a string, meant to be rendered with
        ``unsafe_allow_html=True``.
    """
    import html  # local import keeps this block self-contained

    buffer = io.BytesIO()
    try:
        plt.savefig(buffer, format='pdf', bbox_inches='tight')
    finally:
        # Close even when saving fails so figures do not pile up.
        plt.close()
    b64 = base64.b64encode(buffer.getvalue()).decode()  # bytes -> base64 str
    # Escape and quote the name so spaces or quotes cannot break the markup.
    safe_name = html.escape(filename, quote=True)
    # The payload is a PDF, so label it as one.
    return (
        f'<a href="data:application/pdf;base64,{b64}" download="{safe_name}">'
        f'Download {safe_name} File</a>'
    )
| 121 | + |
| 122 | +#---------------------------------# |
# Title and short description shown at the top of the main panel
# (the string is rendered as markdown).
st.write("""
# The Machine Learning Algorithm Comparison App

In this implementation, the **lazypredict** library is used for building several machine learning models at once.

Developed by: [Data Professor](http://youtube.com/dataprofessor)

""")
| 131 | + |
| 132 | +#---------------------------------# |
# Sidebar - Collects user input features into dataframe
# The header calls return elements, not containers, so they are plain
# statements here; every widget below targets st.sidebar explicitly.
st.sidebar.header('1. Upload your CSV data')
uploaded_file = st.sidebar.file_uploader("Upload your input CSV file", type=["csv"])
st.sidebar.markdown("""
[Example CSV input file](https://raw.githubusercontent.com/dataprofessor/data/master/delaney_solubility_with_descriptors.csv)
""")

# Sidebar - Specify parameter settings
st.sidebar.header('2. Set Parameters')
# Slider arguments are (label, min, max, default, step).
split_size = st.sidebar.slider('Data split ratio (% for Training Set)', 10, 90, 80, 5)
seed_number = st.sidebar.slider('Set the random seed number', 1, 100, 42, 1)
| 144 | + |
| 145 | + |
| 146 | +#---------------------------------# |
| 147 | +# Main panel |
| 148 | + |
# Displays the dataset, then hands it to build_model.
st.subheader('1. Dataset')

if uploaded_file is None:
    # Nothing uploaded yet: offer the bundled example instead.
    st.info('Awaiting for CSV file to be uploaded.')
    if st.button('Press to use Example Dataset'):
        # Boston housing dataset
        # NOTE(review): load_boston was removed from recent scikit-learn
        # releases; the already-imported load_diabetes looks like the
        # intended fallback - confirm against the pinned version.
        boston = load_boston()
        # FOR TESTING PURPOSE, COMMENT THE .loc[:100] SLICES OUT FOR PRODUCTION
        X = pd.DataFrame(boston.data, columns=boston.feature_names).loc[:100]
        Y = pd.Series(boston.target, name='response').loc[:100]
        df = pd.concat([X, Y], axis=1)

        st.markdown('The Boston housing dataset is used as the example.')
        st.write(df.head(5))

        build_model(df)
else:
    # A CSV was uploaded: show it and benchmark the models on it.
    df = pd.read_csv(uploaded_file)
    st.markdown('**1.1. Glimpse of dataset**')
    st.write(df)
    build_model(df)
0 commit comments