-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgenerate_notebooks.py
More file actions
359 lines (313 loc) · 12.9 KB
/
generate_notebooks.py
File metadata and controls
359 lines (313 loc) · 12.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
import os
import nbformat as nbf
from nbconvert.preprocessors import ExecutePreprocessor
def create_notebook(title, description, code_cells, filename):
    """Build, execute, and save a Jupyter notebook.

    Parameters
    ----------
    title : str
        Notebook title rendered in the header markdown cell.
    description : str
        Short project description rendered under the title.
    code_cells : list[tuple[str, str]]
        Ordered (cell_type, content) pairs; cell_type 'markdown' becomes a
        markdown cell, anything else becomes a code cell.
    filename : str
        Output path for the .ipynb file.
    """
    nb = nbf.v4.new_notebook()

    # Header: title, description, and a fixed list of learning objectives.
    header_md = f"""# {title}
{description}
## Objectives
* Load and explore the dataset.
* Visualize the data.
* Train the model using Scikit-Learn.
* Evaluate the model performance.
"""
    nb.cells.append(nbf.v4.new_markdown_cell(header_md))

    # Shared imports cell emitted into every generated notebook.
    imports_code = """import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report
# Ensures that plots are displayed directly inside the notebook
%matplotlib inline"""
    nb.cells.append(nbf.v4.new_code_cell(imports_code))

    # Add project specific cells in the order given.
    for cell_type, content in code_cells:
        if cell_type == 'markdown':
            nb.cells.append(nbf.v4.new_markdown_cell(content))
        else:
            nb.cells.append(nbf.v4.new_code_cell(content))

    # Execute the notebook so the saved file contains cell outputs.
    ep = ExecutePreprocessor(timeout=600, kernel_name='python3')
    try:
        # dirname is '' for a bare filename; fall back to the cwd so the
        # kernel still gets a valid working directory.
        ep.preprocess(nb, {'metadata': {'path': os.path.dirname(filename) or '.'}})
    except Exception as e:
        # Best effort: report the failure but still write the (partially
        # executed) notebook below so the run is inspectable.
        print(f"Error executing notebook {filename}: {e}")

    # Write to file; notebooks are JSON, so pin the encoding explicitly.
    with open(filename, 'w', encoding='utf-8') as f:
        nbf.write(nb, f)
    print(f"Created {filename}")
def main():
    """Generate, execute, and save one example notebook per ML project.

    Each project entry defines the output directory name, the notebook
    title/description, and an ordered list of (cell_type, content) tuples
    passed to create_notebook().
    """
    projects = [
        {
            "name": "Multiple_Linear_Regression",
            "title": "Multiple Linear Regression",
            "desc": "Predicting housing prices using multiple features from the California Housing dataset.",
            "cells": [
                ('markdown', '## 1. Load Dataset\nWe will use the **California Housing** dataset.'),
                ('code', """from sklearn.datasets import fetch_california_housing
data = fetch_california_housing(as_frame=True)
df = data.frame
print(df.head())
print(df.describe())"""),
                ('markdown', '## 2. Exploratory Data Analysis\nLet\'s visualize the relationship between features.'),
                ('code', """plt.figure(figsize=(10, 8))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()"""),
                ('markdown', '## 3. Train Model\nWe will split the data and train a Multiple Linear Regression model.'),
                ('code', """from sklearn.linear_model import LinearRegression
X = df.drop('MedHouseVal', axis=1)
y = df['MedHouseVal']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
print(f'Coefficients: {model.coef_}')
print(f'Intercept: {model.intercept_}')"""),
                ('markdown', '## 4. Evaluation\nEvaluate the model using Mean Squared Error and R2 Score.'),
                ('code', """y_pred = model.predict(X_test)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('R2 Score:', metrics.r2_score(y_test, y_pred))
plt.scatter(y_test, y_pred)
plt.xlabel('Actual Prices')
plt.ylabel('Predicted Prices')
plt.title('Actual vs Predicted Prices')
plt.show()""")
            ]
        },
        {
            "name": "Polynomial_Regression",
            "title": "Polynomial Regression",
            "desc": "Modeling non-linear relationships using Polynomial Regression on synthetic data.",
            "cells": [
                ('markdown', '## 1. Generate Data\nWe generate non-linear synthetic data.'),
                ('code', """np.random.seed(0)
X = 2 - 3 * np.random.normal(0, 1, 20)
y = X - 2 * (X ** 2) + 0.5 * (X ** 3) + np.random.normal(-3, 3, 20)
X = X[:, np.newaxis]
plt.scatter(X, y)
plt.title('Synthetic Non-Linear Data')
plt.show()"""),
                ('markdown', '## 2. Train Model\nTransform features to polynomial features and fit a Linear Regression model.'),
                ('code', """from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
# Transform to polynomial features
poly = PolynomialFeatures(degree=3)
X_poly = poly.fit_transform(X)
# Fit model
model = LinearRegression()
model.fit(X_poly, y)
y_poly_pred = model.predict(X_poly)"""),
                ('markdown', '## 3. Visualization\nPlot the polynomial regression curve.'),
                ('code', """import operator
plt.scatter(X, y, s=10)
# sort the values of x before line plot
sort_axis = operator.itemgetter(0)
sorted_zip = sorted(zip(X,y_poly_pred), key=sort_axis)
x_plot, y_poly_plot = zip(*sorted_zip)
plt.plot(x_plot, y_poly_plot, color='m')
plt.title('Polynomial Regression Fit')
plt.show()"""),
                ('markdown', '## 4. Evaluation'),
                ('code', """print('R2 Score:', metrics.r2_score(y, y_poly_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y, y_poly_pred))""")
            ]
        },
        {
            "name": "Logistic_Regression",
            "title": "Logistic Regression",
            "desc": "Classification of Breast Cancer dataset using Logistic Regression.",
            "cells": [
                ('markdown', '## 1. Load Dataset\nWe use the **Breast Cancer Wisconsin** dataset.'),
                ('code', """from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target
print(df.head())
print(df['target'].value_counts())"""),
                ('markdown', '## 2. Train Model'),
                ('code', """from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
X = data.data
y = data.target
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Scale data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Train model
model = LogisticRegression()
model.fit(X_train, y_train)"""),
                ('markdown', '## 3. Evaluation\nConfusion Matrix and Classification Report.'),
                ('code', """y_pred = model.predict(X_test)
print("Confusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d')
plt.show()
print("Classification Report:")
print(classification_report(y_test, y_pred))""")
            ]
        },
        {
            "name": "K_Nearest_Neighbors",
            "title": "K-Nearest Neighbors (KNN)",
            "desc": "Classification using KNN on the Iris dataset.",
            "cells": [
                ('markdown', '## 1. Load Dataset\nWe use the classic **Iris** dataset.'),
                ('code', """from sklearn.datasets import load_iris
data = load_iris()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target
print(df.head())"""),
                ('markdown', '## 2. Train Model\nWe need to choose K. Let\'s start with K=5.'),
                ('code', """from sklearn.neighbors import KNeighborsClassifier
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, y_train)"""),
                ('markdown', '## 3. Evaluation'),
                ('code', """y_pred = model.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
# Finding optimal K
error_rate = []
for i in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    error_rate.append(np.mean(pred_i != y_test))
plt.figure(figsize=(10,6))
plt.plot(range(1,40), error_rate, color='blue', linestyle='dashed', marker='o',
         markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')
plt.show()""")
            ]
        },
        {
            "name": "Decision_Trees",
            "title": "Decision Trees",
            "desc": "Classification using Decision Trees on the Wine dataset.",
            "cells": [
                ('markdown', '## 1. Load Dataset\nWe use the **Wine** dataset.'),
                ('code', """from sklearn.datasets import load_wine
data = load_wine()
X = data.data
y = data.target
print(f"Features: {data.feature_names}")"""),
                ('markdown', '## 2. Train Model'),
                ('code', """from sklearn.tree import DecisionTreeClassifier, plot_tree
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
model = DecisionTreeClassifier(criterion='entropy', max_depth=3)
model.fit(X_train, y_train)"""),
                ('markdown', '## 3. Visualization and Evaluation'),
                ('code', """plt.figure(figsize=(15,10))
plot_tree(model, feature_names=data.feature_names, class_names=data.target_names, filled=True)
plt.show()
y_pred = model.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))""")
            ]
        },
        {
            "name": "Support_Vector_Machines",
            "title": "Support Vector Machines (SVM)",
            "desc": "Classification using SVM on the Breast Cancer dataset.",
            "cells": [
                ('markdown', '## 1. Load Dataset'),
                ('code', """from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
X = data.data
y = data.target"""),
                ('markdown', '## 2. Train Model'),
                ('code', """from sklearn.svm import SVC
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
model = SVC(kernel='linear') # Try 'rbf' or 'poly' as well
model.fit(X_train, y_train)"""),
                ('markdown', '## 3. Evaluation'),
                ('code', """y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))""")
            ]
        },
        {
            "name": "Clustering_KMeans",
            "title": "Clustering with K-Means",
            "desc": "Unsupervised learning to cluster synthetic data.",
            "cells": [
                ('markdown', '## 1. Generate Data'),
                ('code', """from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)
plt.scatter(X[:,0], X[:,1])
plt.show()"""),
                ('markdown', '## 2. Train Model\nWe use the Elbow Method to find the optimal number of clusters.'),
                ('code', """from sklearn.cluster import KMeans
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
plt.plot(range(1, 11), wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()"""),
                ('markdown', '## 3. Visualize Clusters'),
                ('code', """kmeans = KMeans(n_clusters=4, init='k-means++', max_iter=300, n_init=10, random_state=0)
pred_y = kmeans.fit_predict(X)
plt.scatter(X[:,0], X[:,1], c=pred_y, cmap='viridis')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c='red')
plt.title('Clusters with Centroids')
plt.show()""")
            ]
        },
        {
            "name": "Random_Forest",
            "title": "Random Forest",
            "desc": "Ensemble learning using Random Forest on the Digits dataset.",
            "cells": [
                ('markdown', '## 1. Load Dataset\nWe use the **Digits** dataset for handwritten digit recognition.'),
                ('code', """from sklearn.datasets import load_digits
data = load_digits()
X = data.data
y = data.target
# Visualize the first 4 images
plt.gray()
fig, axes = plt.subplots(1, 4, figsize=(10, 3))
for ax, image, label in zip(axes, data.images[:4], data.target[:4]):
    ax.set_axis_off()
    ax.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    ax.set_title(f'Training: {label}')
plt.show()"""),
                ('markdown', '## 2. Train Model'),
                ('code', """from sklearn.ensemble import RandomForestClassifier
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train, y_train)"""),
                ('markdown', '## 3. Evaluation'),
                ('code', """y_pred = model.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(10,7))
sns.heatmap(cm, annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('Truth')
plt.show()""")
            ]
        }
    ]

    for project in projects:
        directory = project["name"]
        # exist_ok avoids the check-then-create race and makes re-runs safe.
        os.makedirs(directory, exist_ok=True)
        filename = os.path.join(directory, f"{project['name']}.ipynb")
        create_notebook(project['title'], project['desc'], project['cells'], filename)


if __name__ == "__main__":
    main()