Skip to content

Commit 13e846d

Browse files
Merge pull request #330 from dataprofessor/dev
Creating an app folder with the lazypredict app
2 parents 4b65cd9 + 563a498 commit 13e846d

File tree

3 files changed

+236
-0
lines changed

3 files changed

+236
-0
lines changed

app/README.md

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# ml-auto-app
2+
3+
# Demo
4+
5+
Launch the web app:
6+
7+
[![Streamlit App](https://static.streamlit.io/badges/streamlit_badge_black_white.svg)](https://share.streamlit.io/dataprofessor/ml-auto-app/main/app.py)
8+
9+
# Reproducing this web app
10+
To recreate this web app on your own computer, do the following.
11+
12+
### Create conda environment
13+
Firstly, we will create a conda environment called *lazypredict*
14+
```
15+
conda create -n lazypredict python=3.7.9
16+
```
17+
Secondly, we will activate the *lazypredict* environment
18+
```
19+
conda activate lazypredict
20+
```
21+
### Install prerequisite libraries
22+
23+
Download the requirements.txt file
24+
25+
```
26+
wget https://raw.githubusercontent.com/dataprofessor/ml-auto-app/main/requirements.txt
27+
28+
```
29+
30+
Pip install libraries
31+
```
32+
pip install -r requirements.txt
33+
```
34+
### Download and unzip contents from GitHub repo
35+
36+
Download and unzip contents from https://github.com/dataprofessor/ml-auto-app/archive/main.zip
37+
38+
### Launch the app
39+
40+
```
41+
streamlit run app.py
42+
```

app/app.py

Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
# App created by Data Professor http://youtube.com/dataprofessor
2+
# GitHub repo of this app https://github.com/dataprofessor/ml-auto-app
3+
# Demo of this app https://share.streamlit.io/dataprofessor/ml-auto-app/main/app.py
4+
5+
import streamlit as st
6+
import pandas as pd
7+
from lazypredict.Supervised import LazyRegressor
8+
from sklearn.model_selection import train_test_split
9+
from sklearn.ensemble import RandomForestRegressor
10+
from sklearn.metrics import mean_squared_error, r2_score
11+
from sklearn.datasets import load_diabetes, load_boston
12+
import matplotlib.pyplot as plt
13+
import seaborn as sns
14+
import base64
15+
import io
16+
#---------------------------------#
17+
# Page layout
18+
## Page expands to full width
19+
# Configure the browser tab title and use the full page width.
st.set_page_config(
    page_title='The Machine Learning Algorithm Comparison App',
    layout='wide',
)
21+
#---------------------------------#
22+
# Model building
23+
def _render_metric_plots(scores, metric, heading, slug, axis_limits=None):
    """Render the tall and wide bar plots of one metric plus download links.

    Args:
        scores: DataFrame with one row per model (models on the index) and a
            column named ``metric``.
        metric: Name of the column to plot.
        heading: Markdown heading written above the plots.
        slug: Middle part of the download file names
            (``plot-<slug>-tall.pdf`` / ``plot-<slug>-wide.pdf``).
        axis_limits: Optional ``(low, high)`` applied to the metric axis.
    """
    st.markdown(heading)

    # Tall layout: offered as a download only (it is never passed to
    # st.pyplot); imagedownload() saves and then closes the current figure.
    plt.figure(figsize=(3, 9))
    sns.set_theme(style="whitegrid")
    ax = sns.barplot(y=scores.index, x=metric, data=scores)
    if axis_limits is not None:
        ax.set(xlim=axis_limits)
    st.markdown(imagedownload(plt, f'plot-{slug}-tall.pdf'), unsafe_allow_html=True)

    # Wide layout: shown in the page, then offered as a download.
    plt.figure(figsize=(9, 3))
    sns.set_theme(style="whitegrid")
    ax = sns.barplot(x=scores.index, y=metric, data=scores)
    if axis_limits is not None:
        ax.set(ylim=axis_limits)
    plt.xticks(rotation=90)
    st.pyplot(plt)
    st.markdown(imagedownload(plt, f'plot-{slug}-wide.pdf'), unsafe_allow_html=True)


def build_model(df, max_rows=101):
    """Fit the LazyRegressor model zoo on ``df`` and render the results.

    The last column of ``df`` is used as the target ``Y`` and every other
    column as a feature in ``X``.  The function reads the module-level
    ``split_size`` (percentage of rows used for training) and ``seed_number``
    values that the sidebar sliders set, and writes tables, download links
    and plots to the Streamlit page.

    Args:
        df: Input DataFrame; features first, target in the last column.
        max_rows: Maximum number of leading rows to use.  The default of 101
            keeps the cap this function has always applied for testing
            (rows labelled 0..100 on a default index); pass ``None`` to use
            the whole frame.
    """
    if max_rows is not None:
        # Positional slice so the cap also works when the index is not 0..n-1.
        df = df.iloc[:max_rows]
    X = df.iloc[:, :-1]  # Using all column except for the last column as X
    Y = df.iloc[:, -1]  # Selecting the last column as Y

    st.markdown('**1.2. Dataset dimension**')
    st.write('X')
    st.info(X.shape)
    st.write('Y')
    st.info(Y.shape)

    st.markdown('**1.3. Variable details**:')
    st.write('X variable (first 20 are shown)')
    st.info(list(X.columns[:20]))
    st.write('Y variable')
    st.info(Y.name)

    # Build lazy model.
    # split_size is the percentage of rows for the *training* set, so it is
    # converted to a fraction and passed as train_size.  Passing the raw
    # integer as test_size would be read as an absolute number of test rows.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, train_size=split_size / 100, random_state=seed_number
    )
    reg = LazyRegressor(verbose=0, ignore_warnings=False, custom_metric=None)
    # First fit scores the models on the training data itself, second on the
    # held-out test data.
    models_train, predictions_train = reg.fit(X_train, X_train, Y_train, Y_train)
    models_test, predictions_test = reg.fit(X_train, X_test, Y_train, Y_test)

    st.subheader('2. Table of Model Performance')

    st.write('Training set')
    st.write(predictions_train)
    st.markdown(filedownload(predictions_train, 'training.csv'), unsafe_allow_html=True)

    st.write('Test set')
    st.write(predictions_test)
    st.markdown(filedownload(predictions_test, 'test.csv'), unsafe_allow_html=True)

    st.subheader('3. Plot of Model Performance (Test set)')

    # Clip on a copy so the table shown above is not modified after the fact.
    plot_scores = predictions_test.copy()

    # Negative R-squared values are drawn as 0 so the axis can stay at 0..1.
    plot_scores["R-Squared"] = plot_scores["R-Squared"].clip(lower=0)
    _render_metric_plots(plot_scores, "R-Squared", '**R-squared**', 'r2',
                         axis_limits=(0, 1))

    # Very large errors are capped so they do not flatten the other bars.
    plot_scores["RMSE"] = plot_scores["RMSE"].clip(upper=50)
    _render_metric_plots(plot_scores, "RMSE", '**RMSE (capped at 50)**', 'rmse')

    plot_scores["Time Taken"] = plot_scores["Time Taken"].clip(lower=0)
    _render_metric_plots(plot_scores, "Time Taken", '**Calculation time**',
                         'calculation-time')
105+
106+
# Download CSV data
107+
# https://discuss.streamlit.io/t/how-to-download-file-in-streamlit/1806
108+
def filedownload(df, filename):
    """Build an HTML link that downloads ``df`` as a CSV file.

    The frame is serialised with ``to_csv(index=False)``, base64-encoded and
    embedded in a ``data:`` URI, so no file is written on the server.

    Args:
        df: Object exposing ``to_csv`` (a pandas DataFrame in this app).
        filename: File name suggested to the browser and shown in the link.

    Returns:
        str: ``<a>`` markup intended for
        ``st.markdown(..., unsafe_allow_html=True)``.
    """
    from html import escape  # local import: only the link helpers need it

    # NOTE(review): index=False leaves the index out of the CSV; confirm this
    # is wanted for tables whose index carries the row labels.
    csv = df.to_csv(index=False)
    b64 = base64.b64encode(csv.encode()).decode()  # strings <-> bytes conversions
    # Quote and escape the name so spaces or quotes cannot break the markup.
    safe_name = escape(str(filename), quote=True)
    return (
        f'<a href="data:text/csv;base64,{b64}" download="{safe_name}">'
        f'Download {safe_name} File</a>'
    )
113+
114+
def imagedownload(plt, filename):
    """Build an HTML link that downloads the current figure as a PDF.

    The figure is saved into an in-memory buffer through ``plt.savefig`` and
    then closed with ``plt.close()``, so callers must display the figure
    before calling this function.

    Args:
        plt: Object exposing ``savefig`` and ``close`` (the
            ``matplotlib.pyplot`` module in this app).
        filename: File name suggested to the browser and shown in the link.

    Returns:
        str: ``<a>`` markup intended for
        ``st.markdown(..., unsafe_allow_html=True)``.
    """
    from html import escape  # local import: only the link helpers need it

    buffer = io.BytesIO()
    plt.savefig(buffer, format='pdf', bbox_inches='tight')
    plt.close()
    b64 = base64.b64encode(buffer.getvalue()).decode()  # strings <-> bytes conversions
    # Quote and escape the name so spaces or quotes cannot break the markup.
    safe_name = escape(str(filename), quote=True)
    # The payload is a PDF, so the data URI must say so (not image/png).
    return (
        f'<a href="data:application/pdf;base64,{b64}" download="{safe_name}">'
        f'Download {safe_name} File</a>'
    )
121+
122+
#---------------------------------#
123+
st.write("""
# The Machine Learning Algorithm Comparison App

In this implementation, the **lazypredict** library is used for building several machine learning models at once.

Developed by: [Data Professor](http://youtube.com/dataprofessor)

""")

#---------------------------------#
# Sidebar - Collects user input features into dataframe
# The header is an ordinary element call, not a container, so it is not used
# as a context manager.
st.sidebar.header('1. Upload your CSV data')
uploaded_file = st.sidebar.file_uploader("Upload your input CSV file", type=["csv"])
st.sidebar.markdown("""
[Example CSV input file](https://raw.githubusercontent.com/dataprofessor/data/master/delaney_solubility_with_descriptors.csv)
""")

# Sidebar - Specify parameter settings
# Both values are read as module-level globals by build_model().
st.sidebar.header('2. Set Parameters')
split_size = st.sidebar.slider('Data split ratio (% for Training Set)', 10, 90, 80, 5)
seed_number = st.sidebar.slider('Set the random seed number', 1, 100, 42, 1)

#---------------------------------#
# Main panel

# Displays the dataset
st.subheader('1. Dataset')

if uploaded_file is not None:
    df = pd.read_csv(uploaded_file)
    st.markdown('**1.1. Glimpse of dataset**')
    st.write(df)
    build_model(df)
else:
    st.info('Awaiting for CSV file to be uploaded.')
    if st.button('Press to use Example Dataset'):
        # Boston housing dataset; features first, target ('response') last,
        # which is the column layout build_model() expects.
        boston = load_boston()
        X = pd.DataFrame(boston.data, columns=boston.feature_names).loc[:100]  # FOR TESTING PURPOSE, COMMENT THIS OUT FOR PRODUCTION
        Y = pd.Series(boston.target, name='response').loc[:100]  # FOR TESTING PURPOSE, COMMENT THIS OUT FOR PRODUCTION
        df = pd.concat([X, Y], axis=1)

        st.markdown('The Boston housing dataset is used as the example.')
        st.write(df.head(5))

        build_model(df)

app/requirements.txt

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
streamlit==0.71.0
2+
pandas==1.1.3
3+
base58==2.0.1
4+
numpy==1.19.2
5+
pillow==8.0.1
6+
plotly==4.14.1
7+
scikit-learn==0.23.2
8+
lazypredict==0.2.7
9+
seaborn==0.11.1
10+
matplotlib==3.3.3
11+
xgboost==1.1.1
12+
lightgbm==2.3.1
13+
pytest==5.4.3
14+
tqdm==4.56.0

0 commit comments

Comments
 (0)