Merge pull request #330 from dataprofessor/dev

shankarpandala · web-flow · commit 13e846d820da · 2021-02-17T04:33:27.000Z
Creating an app folder with the lazypredict app
diff --git a/app/README.md b/app/README.md
@@ -0,0 +1,42 @@
+# ml-auto-app
+
+# Demo
+
+Launch the web app:
+
+[![Streamlit App](https://static.streamlit.io/badges/streamlit_badge_black_white.svg)](https://share.streamlit.io/dataprofessor/ml-auto-app/main/app.py)
+
+# Reproducing this web app
+To recreate this web app on your own computer, do the following.
+
+### Create conda environment
+Firstly, we will create a conda environment called *lazypredict*
+```
+conda create -n lazypredict python=3.7.9
+```
+Secondly, we will activate the *lazypredict* environment
+```
+conda activate lazypredict
+```
+### Install prerequisite libraries
+
+Download the requirements.txt file
+
+```
+wget https://raw.githubusercontent.com/dataprofessor/ml-auto-app/main/requirements.txt
+
+```
+
+Install the libraries with pip
+```
+pip install -r requirements.txt
+```
+###  Download and unzip contents from GitHub repo
+
+Download and unzip contents from https://github.com/dataprofessor/ml-auto-app/archive/main.zip
+
+###  Launch the app
+
+```
+streamlit run app.py
+```
diff --git a/app/app.py b/app/app.py
@@ -0,0 +1,180 @@
+# App created by Data Professor http://youtube.com/dataprofessor
+# GitHub repo of this app https://github.com/dataprofessor/ml-auto-app
+# Demo of this app https://share.streamlit.io/dataprofessor/ml-auto-app/main/app.py
+
+import streamlit as st
+import pandas as pd
+from lazypredict.Supervised import LazyRegressor
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.metrics import mean_squared_error, r2_score
+from sklearn.datasets import load_diabetes, load_boston
+import matplotlib.pyplot as plt
+import seaborn as sns
+import base64
+import io
+#---------------------------------#
+# Page layout: set the page title; this is the first st.* call made by the script
+## layout='wide' makes the page expand to full width
+st.set_page_config(page_title='The Machine Learning Algorithm Comparison App',
+    layout='wide')
+#---------------------------------#
+# Model building
+def _plot_metric(scores, metric, stem, limit=None):
+    """Draw the tall and wide bar charts of one score column and emit their PDF links."""
+    # Tall: one bar per row label on the y-axis. Only the download link is
+    # emitted for it; imagedownload() saves and then closes the figure.
+    plt.figure(figsize=(3, 9))
+    sns.set_theme(style="whitegrid")
+    tall = sns.barplot(y=scores.index, x=metric, data=scores)
+    if limit is not None:
+        tall.set(xlim=limit)
+    st.markdown(imagedownload(plt, f'plot-{stem}-tall.pdf'), unsafe_allow_html=True)
+    # Wide: row labels on the x-axis; shown on the page, then offered as a PDF.
+    plt.figure(figsize=(9, 3))
+    sns.set_theme(style="whitegrid")
+    wide = sns.barplot(x=scores.index, y=metric, data=scores)
+    if limit is not None:
+        wide.set(ylim=limit)
+    plt.xticks(rotation=90)
+    st.pyplot(plt)
+    st.markdown(imagedownload(plt, f'plot-{stem}-wide.pdf'), unsafe_allow_html=True)
+
+
+def build_model(df):
+    """Fit the LazyRegressor model suite on `df` and render the comparison report.
+
+    The last column of `df` is used as the target Y and all other columns as
+    the features X. Reads the module-level sidebar values `split_size`
+    (percentage of rows for the training set) and `seed_number` (random seed
+    of the split). Writes dataset details, score tables with CSV links, and
+    bar charts with PDF links to the Streamlit page; returns nothing.
+    """
+    df = df.loc[:100] # FOR TESTING PURPOSE, COMMENT THIS OUT FOR PRODUCTION
+    X = df.iloc[:,:-1] # Using all column except for the last column as X
+    Y = df.iloc[:,-1] # Selecting the last column as Y
+
+    st.markdown('**1.2. Dataset dimension**')
+    st.write('X')
+    st.info(X.shape)
+    st.write('Y')
+    st.info(Y.shape)
+
+    st.markdown('**1.3. Variable details**:')
+    st.write('X variable (first 20 are shown)')
+    st.info(list(X.columns[:20]))
+    st.write('Y variable')
+    st.info(Y.name)
+
+    # Build lazy model. The slider gives the TRAINING percentage (10-90), while
+    # train_test_split wants the TEST share; an int test_size would be read as
+    # an absolute number of test rows, so convert to a fraction explicitly.
+    test_fraction = (100 - split_size) / 100
+    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_fraction, random_state=seed_number)
+    reg = LazyRegressor(verbose=0,ignore_warnings=False, custom_metric=None)
+    # Same training rows both times: scored on themselves, then on the held-out rows.
+    models_train,predictions_train = reg.fit(X_train, X_train, Y_train, Y_train)
+    models_test,predictions_test = reg.fit(X_train, X_test, Y_train, Y_test)
+
+    st.subheader('2. Table of Model Performance')
+
+    # reset_index() keeps the row labels (the per-model axis of the charts
+    # below) as a CSV column; filedownload() itself writes without the index.
+    st.write('Training set')
+    st.write(predictions_train)
+    st.markdown(filedownload(predictions_train.reset_index(),'training.csv'), unsafe_allow_html=True)
+
+    st.write('Test set')
+    st.write(predictions_test)
+    st.markdown(filedownload(predictions_test.reset_index(),'test.csv'), unsafe_allow_html=True)
+
+    st.subheader('3. Plot of Model Performance (Test set)')
+
+    # Clip in place before plotting; the tables/CSV links above were already emitted unclipped.
+    st.markdown('**R-squared**')
+    predictions_test["R-Squared"] = predictions_test["R-Squared"].clip(lower=0)
+    _plot_metric(predictions_test, "R-Squared", 'r2', limit=(0, 1))
+
+    st.markdown('**RMSE (capped at 50)**')
+    predictions_test["RMSE"] = predictions_test["RMSE"].clip(upper=50)
+    _plot_metric(predictions_test, "RMSE", 'rmse')
+
+    st.markdown('**Calculation time**')
+    predictions_test["Time Taken"] = predictions_test["Time Taken"].clip(lower=0)
+    _plot_metric(predictions_test, "Time Taken", 'calculation-time')
+
+# Download CSV data
+# https://discuss.streamlit.io/t/how-to-download-file-in-streamlit/1806
+def filedownload(df, filename):
+    """Return an HTML <a> tag that downloads `df` as a CSV file named `filename` (row index omitted)."""
+    csv = df.to_csv(index=False)
+    b64 = base64.b64encode(csv.encode()).decode()  # text -> bytes -> base64 text for the data URI
+    return f'<a href="data:text/csv;base64,{b64}" download="{filename}">Download {filename} File</a>'
+
+def imagedownload(plt, filename):
+    """Save `plt`'s figure as PDF, close it, and return an HTML <a> tag that downloads it as `filename`."""
+    s = io.BytesIO()
+    plt.savefig(s, format='pdf', bbox_inches='tight')
+    plt.close()  # the figure is no longer needed once its bytes are captured
+    b64 = base64.b64encode(s.getvalue()).decode()  # PDF bytes -> base64 text for the data URI
+    return f'<a href="data:application/pdf;base64,{b64}" download="{filename}">Download {filename} File</a>'
+
+#---------------------------------#
+st.write("""
+# The Machine Learning Algorithm Comparison App
+
+In this implementation, the **lazypredict** library is used for building several machine learning models at once.
+
+Developed by: [Data Professor](http://youtube.com/dataprofessor)
+
+""")  # page heading, one-sentence description and author credit
+
+#---------------------------------#
+# Sidebar - CSV upload widget plus a link to an example input file
+with st.sidebar.header('1. Upload your CSV data'):
+    uploaded_file = st.sidebar.file_uploader("Upload your input CSV file", type=["csv"])  # compared against None in the main panel
+    st.sidebar.markdown("""
+[Example CSV input file](https://raw.githubusercontent.com/dataprofessor/data/master/delaney_solubility_with_descriptors.csv)
+""")
+
+# Sidebar - Specify parameter settings (both names are read as globals inside build_model)
+with st.sidebar.header('2. Set Parameters'):
+    split_size = st.sidebar.slider('Data split ratio (% for Training Set)', 10, 90, 80, 5)  # integer percentage: min 10, max 90, default 80, step 5
+    seed_number = st.sidebar.slider('Set the random seed number', 1, 100, 42, 1)  # min 1, max 100, default 42, step 1
+
+
+#---------------------------------#
+# Main panel
+
+# Show and model the uploaded CSV; without an upload, offer a built-in example dataset
+st.subheader('1. Dataset')
+
+if uploaded_file is not None:
+    df = pd.read_csv(uploaded_file)  # parse the uploaded file into a DataFrame
+    st.markdown('**1.1. Glimpse of dataset**')
+    st.write(df)
+    build_model(df)  # build_model takes the last column as the target
+else:
+    st.info('Awaiting for CSV file to be uploaded.')
+    if st.button('Press to use Example Dataset'):
+        # Diabetes dataset (alternative example, currently disabled)
+        #diabetes = load_diabetes()
+        #X = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
+        #Y = pd.Series(diabetes.target, name='response')
+        #df = pd.concat( [X,Y], axis=1 )
+
+        #st.markdown('The Diabetes dataset is used as the example.')
+        #st.write(df.head(5))
+
+        # Boston housing dataset (the example that is actually used)
+        boston = load_boston()
+        #X = pd.DataFrame(boston.data, columns=boston.feature_names)
+        #Y = pd.Series(boston.target, name='response')
+        X = pd.DataFrame(boston.data, columns=boston.feature_names).loc[:100] # FOR TESTING PURPOSE, COMMENT THIS OUT FOR PRODUCTION
+        Y = pd.Series(boston.target, name='response').loc[:100] # FOR TESTING PURPOSE, COMMENT THIS OUT FOR PRODUCTION
+        df = pd.concat( [X,Y], axis=1 )  # features first, target ('response') as the last column
+
+        st.markdown('The Boston housing dataset is used as the example.')
+        st.write(df.head(5))
+
+        build_model(df)
diff --git a/app/requirements.txt b/app/requirements.txt
@@ -0,0 +1,14 @@
+streamlit==0.71.0
+pandas==1.1.3
+base58==2.0.1
+numpy==1.19.2
+pillow==8.0.1
+plotly==4.14.1
+scikit-learn==0.23.2
+lazypredict==0.2.7
+seaborn==0.11.1
+matplotlib==3.3.3
+xgboost==1.1.1
+lightgbm==2.3.1
+pytest==5.4.3
+tqdm==4.56.0