Commit dd212c4

Add Gaussian Mixture Model (GMM) algorithm

1 parent e2a78d4 commit dd212c4
1 file changed: 158 additions, 0 deletions
"""
README, Author - Md Ruman Islam (mailto:ruman23.github.io)

Requirements:
    - numpy
    - scipy
    - matplotlib
    - scikit-learn (only for the demo under __main__)

Python:
    - 3.8+

Inputs:
    - X : a 2D numpy array of features, shape (n_samples, n_features).
    - n_components : number of Gaussian distributions (clusters) to fit.
    - max_iter : maximum number of EM iterations.
    - tol : convergence tolerance on the change in log-likelihood.

Usage:
    1. Define the 'n_components' value and the 'X' feature array.
    2. Initialize the model:
           gmm = GaussianMixture(n_components=3, max_iter=100)
    3. Fit the model to the data:
           gmm.fit(X)
    4. Get cluster predictions:
           labels = gmm.predict(X)
    5. Visualize the results:
           gmm.plot_results(X)
"""

import warnings

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal

warnings.filterwarnings("ignore")

TAG = "GAUSSIAN-MIXTURE/ "


class GaussianMixture:
    """
    Gaussian Mixture Model implemented using the Expectation-Maximization algorithm.
    """

    def __init__(self, n_components=2, max_iter=100, tol=1e-4, seed=None):
        self.n_components = n_components
        self.max_iter = max_iter
        self.tol = tol
        self.seed = seed

        # parameters
        self.weights_ = None
        self.means_ = None
        self.covariances_ = None
        self.log_likelihoods_ = []

    def _initialize_parameters(self, X):
        """Randomly initialize means, covariances, and mixture weights"""
        rng = np.random.default_rng(self.seed)
        n_samples, n_features = X.shape

        indices = rng.choice(n_samples, self.n_components, replace=False)
        self.means_ = X[indices]

        self.covariances_ = np.array(
            [np.cov(X, rowvar=False) for _ in range(self.n_components)]
        )
        self.weights_ = np.ones(self.n_components) / self.n_components
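        # Note: each mean is seeded from a distinct data point and every covariance
        # starts as the covariance of the whole dataset, so different seeds can lead
        # EM to different local optima of the likelihood.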

    def _e_step(self, X):
        """Compute responsibilities (posterior probabilities)"""
        n_samples = X.shape[0]
        responsibilities = np.zeros((n_samples, self.n_components))

        for k in range(self.n_components):
            rv = multivariate_normal(mean=self.means_[k], cov=self.covariances_[k])
            responsibilities[:, k] = self.weights_[k] * rv.pdf(X)

        # Normalize to get probabilities
        responsibilities /= responsibilities.sum(axis=1, keepdims=True)
        return responsibilities
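        # For reference: the value computed above is the responsibility
        #     gamma_ik = pi_k * N(x_i | mu_k, Sigma_k) / sum_j pi_j * N(x_i | mu_j, Sigma_j),
        # i.e. the posterior probability that sample i was generated by component k.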

    def _m_step(self, X, responsibilities):
        """Update weights, means, and covariances"""
        n_samples, n_features = X.shape
        Nk = responsibilities.sum(axis=0)

        self.weights_ = Nk / n_samples
        self.means_ = (responsibilities.T @ X) / Nk[:, np.newaxis]

        for k in range(self.n_components):
            diff = X - self.means_[k]
            self.covariances_[k] = (responsibilities[:, k][:, np.newaxis] * diff).T @ diff
            self.covariances_[k] /= Nk[k]
            # Add small regularization term for numerical stability
            self.covariances_[k] += np.eye(n_features) * 1e-6
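        # These are the standard M-step updates, with N_k = sum_i gamma_ik:
        #     pi_k    = N_k / n_samples
        #     mu_k    = (1 / N_k) * sum_i gamma_ik * x_i
        #     Sigma_k = (1 / N_k) * sum_i gamma_ik * (x_i - mu_k)(x_i - mu_k)^T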

    def _compute_log_likelihood(self, X):
        """Compute total log-likelihood of the model"""
        n_samples = X.shape[0]
        total_pdf = np.zeros((n_samples, self.n_components))

        for k in range(self.n_components):
            rv = multivariate_normal(mean=self.means_[k], cov=self.covariances_[k])
            total_pdf[:, k] = self.weights_[k] * rv.pdf(X)

        log_likelihood = np.sum(np.log(np.sum(total_pdf, axis=1) + 1e-12))
        return log_likelihood
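        # log L = sum_i log( sum_k pi_k * N(x_i | mu_k, Sigma_k) ); the 1e-12 term
        # guards against log(0) if every component density underflows for a sample.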

    def fit(self, X):
        """Fit the Gaussian Mixture Model to data using the EM algorithm"""
        self._initialize_parameters(X)
        self.log_likelihoods_ = []  # reset history so repeated fit() calls start fresh

        prev_log_likelihood = None

        for i in range(self.max_iter):
            # E-step
            responsibilities = self._e_step(X)

            # M-step
            self._m_step(X, responsibilities)

            # Log-likelihood
            log_likelihood = self._compute_log_likelihood(X)
            self.log_likelihoods_.append(log_likelihood)

            if prev_log_likelihood is not None:
                if abs(log_likelihood - prev_log_likelihood) < self.tol:
                    print(f"{TAG}Converged at iteration {i}.")
                    break
            prev_log_likelihood = log_likelihood

        print(f"{TAG}Training complete. Final log-likelihood: {log_likelihood:.4f}")

    def predict(self, X):
        """Predict cluster assignment for each data point"""
        responsibilities = self._e_step(X)
        return np.argmax(responsibilities, axis=1)
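        # predict() returns hard labels; the soft per-component probabilities are the
        # responsibilities returned by _e_step(X) if graded membership is needed.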

    def plot_results(self, X):
        """Visualize GMM clustering results (2D only)"""
        if X.shape[1] != 2:
            print(f"{TAG}Plotting only supported for 2D data.")
            return

        labels = self.predict(X)
        plt.scatter(X[:, 0], X[:, 1], c=labels, cmap="viridis", s=30)
        plt.scatter(self.means_[:, 0], self.means_[:, 1], c="red", s=100, marker="x")
        plt.title("Gaussian Mixture Model Clustering")
        plt.xlabel("Feature 1")
        plt.ylabel("Feature 2")
        plt.show()


# Mock test
if __name__ == "__main__":
    from sklearn.datasets import make_blobs

    X, _ = make_blobs(n_samples=300, centers=3, cluster_std=1.2, random_state=42)
    gmm = GaussianMixture(n_components=3, max_iter=100, seed=42)
    gmm.fit(X)
    labels = gmm.predict(X)
    gmm.plot_results(X)
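As a follow-on to the demo, a minimal sketch of how the fitted model could be inspected, assuming the GaussianMixture class above and the X array from the __main__ block; it only touches the weights_, means_, and log_likelihoods_ attributes the class already stores:

    # Illustrative only: inspect what fit() learned (run inside the __main__ block above).
    print("mixture weights:", np.round(gmm.weights_, 3))
    print("component means:\n", np.round(gmm.means_, 3))

    # Plot the log-likelihood recorded at each EM iteration as a convergence check.
    plt.plot(gmm.log_likelihoods_, marker="o")
    plt.xlabel("EM iteration")
    plt.ylabel("Log-likelihood")
    plt.title("EM convergence")
    plt.show()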
