Skip to content

Commit 6247f83

Browse files
Lakhan-Nadkvedala
andauthored
feat: K Means Clustering Added (#589)
* feat: K Means Clustering Added Implemented K Means Clustering in C The data set used for implementing is ordered pair(x,y) in 2D plane. * Update k_means_clustering.c Lint suggested changes * Update k_means_clustering.c Lint suggested changes * Update k_means_clustering.c * Update k_means_clustering.c Added float headers and also included macro _USE_MATH_DEFINES * Update machine_learning/k_means_clustering.c Co-authored-by: Krishna Vedala <[email protected]> * update: change in docs and a new test added * Update machine_learning/k_means_clustering.c Co-authored-by: Krishna Vedala <[email protected]> * Update machine_learning/k_means_clustering.c Co-authored-by: Krishna Vedala <[email protected]> * update: scale down rand() before multiplication * update: image width specifid and documentation grouped * update: lint suggested changes Co-authored-by: Krishna Vedala <[email protected]>
1 parent 37dede4 commit 6247f83

File tree

1 file changed

+335
-0
lines changed

1 file changed

+335
-0
lines changed

machine_learning/k_means_clustering.c

Lines changed: 335 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,335 @@
1+
/**
2+
* @file k_means_clustering.c
3+
* @brief K Means Clustering Algorithm implemented
4+
* @details
5+
* This file has the K Means algorithm implemented
6+
* It prints test output in eps format
7+
*
8+
* Note:
9+
* Though the code for clustering works for all the
10+
* 2D data points and can be extended for any size vector
11+
* by making the required changes, but note that
12+
* the output method i.e. printEPS is only good for
13+
* polar data points i.e. in a circle and both test
14+
* use the same.
15+
* @author [Lakhan Nad](https://github.com/Lakhan-Nad)
16+
*/
17+
18+
#define _USE_MATH_DEFINES /* required for MS Visual C */
19+
#include <float.h> /* DBL_MAX, DBL_MIN */
20+
#include <math.h> /* PI, sin, cos */
21+
#include <stdio.h> /* printf */
22+
#include <stdlib.h> /* rand */
23+
#include <string.h> /* memset */
24+
#include <time.h> /* time */
25+
26+
/*!
27+
* @addtogroup machine_learning Machine Learning Algorithms
28+
* @{
29+
* @addtogroup k_means K-Means Clustering Algorithm
30+
* @{
31+
*/
32+
33+
/*! @struct observation
 * A single 2D data point to be clustered.
 * The name "observation" denotes a random point in the plane;
 * the algorithm mutates only the `group` field during clustering.
 */
typedef struct observation {
  double x; /**< abscissa of 2D data point */
  double y; /**< ordinate of 2D data point */
  int group; /**< index of the cluster this observation is currently assigned to */
} observation;
43+
44+
/*! @struct cluster
 * Summary of one cluster: the coordinates of the centroid of all
 * observations assigned to it, plus the number of such observations.
 * `x`/`y` temporarily hold running sums inside kMeans before being
 * divided by `count`.
 */
typedef struct cluster {
  double x; /**< abscissa of centroid of this cluster */
  double y; /**< ordinate of centroid of this cluster */
  size_t count; /**< count of observations present in this cluster */
} cluster;
56+
57+
/*! @fn calculateNearest
58+
* Returns the index of centroid nearest to
59+
* given observation
60+
*
61+
* @param o observation
62+
* @param clusters array of cluster having centroids coordinates
63+
* @param k size of clusters array
64+
*
65+
* @returns the index of nearest centroid for given observation
66+
*/
67+
int calculateNearst(observation* o, cluster clusters[], int k) {
68+
double minD = DBL_MAX;
69+
double dist = 0;
70+
int index = -1;
71+
int i = 0;
72+
for (; i < k; i++) {
73+
/* Calculate Squared Distance*/
74+
dist = (clusters[i].x - o->x) * (clusters[i].x - o->x) +
75+
(clusters[i].y - o->y) * (clusters[i].y - o->y);
76+
if (dist < minD) {
77+
minD = dist;
78+
index = i;
79+
}
80+
}
81+
return index;
82+
}
83+
84+
/*! @fn calculateCentroid
85+
* Calculate centoid and assign it to the cluster variable
86+
*
87+
* @param observations an array of observations whose centroid is calculated
88+
* @param size size of the observations array
89+
* @param centroid a reference to cluster object to store information of
90+
* centroid
91+
*/
92+
void calculateCentroid(observation observations[], size_t size,
93+
cluster* centroid) {
94+
size_t i = 0;
95+
centroid->x = 0;
96+
centroid->y = 0;
97+
centroid->count = size;
98+
for (; i < size; i++) {
99+
centroid->x += observations[i].x;
100+
centroid->y += observations[i].y;
101+
observations[i].group = 0;
102+
}
103+
centroid->x /= centroid->count;
104+
centroid->y /= centroid->count;
105+
}
106+
107+
/*! @fn kMeans
108+
* --K Means Algorithm--
109+
* 1. Assign each observation to one of k groups
110+
* creating a random initial clustering
111+
* 2. Find the centroid of observations for each
112+
* cluster to form new centroids
113+
* 3. Find the centroid which is nearest for each
114+
* observation among the calculated centroids
115+
* 4. Assign the observation to its nearest centroid
116+
* to create a new clustering.
117+
* 5. Repeat step 2,3,4 until there is no change
118+
* the current clustering and is same as last
119+
* clustering.
120+
* @param observations an array of observations to cluster
121+
* @param size size of observations array
122+
* @param k no of clusters to be made
123+
*
124+
* @returns pointer to cluster object
125+
*/
126+
cluster* kMeans(observation observations[], size_t size, int k) {
127+
cluster* clusters = NULL;
128+
if (k <= 1) {
129+
/*
130+
If we have to cluster them only in one group
131+
then calculate centroid of observations and
132+
that will be a ingle cluster
133+
*/
134+
clusters = (cluster*)malloc(sizeof(cluster));
135+
memset(clusters, 0, sizeof(cluster));
136+
calculateCentroid(observations, size, clusters);
137+
} else if (k < size) {
138+
clusters = malloc(sizeof(cluster) * k);
139+
memset(clusters, 0, k * sizeof(cluster));
140+
/* STEP 1 */
141+
for (size_t j = 0; j < size; j++) {
142+
observations[j].group = rand() % k;
143+
}
144+
size_t changed = 0;
145+
size_t minAcceptedError =
146+
size / 10000; // Do until 99.99 percent points are in correct cluster
147+
int t = 0;
148+
do {
149+
/* Initialize clusters */
150+
for (int i = 0; i < k; i++) {
151+
clusters[i].x = 0;
152+
clusters[i].y = 0;
153+
clusters[i].count = 0;
154+
}
155+
/* STEP 2*/
156+
for (size_t j = 0; j < size; j++) {
157+
t = observations[j].group;
158+
clusters[t].x += observations[j].x;
159+
clusters[t].y += observations[j].y;
160+
clusters[t].count++;
161+
}
162+
for (int i = 0; i < k; i++) {
163+
clusters[i].x /= clusters[i].count;
164+
clusters[i].y /= clusters[i].count;
165+
}
166+
/* STEP 3 and 4 */
167+
changed = 0; // this variable stores change in clustering
168+
for (size_t j = 0; j < size; j++) {
169+
t = calculateNearst(observations + j, clusters, k);
170+
if (t != observations[j].group) {
171+
changed++;
172+
observations[j].group = t;
173+
}
174+
}
175+
} while (changed > minAcceptedError); // Keep on grouping until we have
176+
// got almost best clustering
177+
} else {
178+
/* If no of clusters is more than observations
179+
each observation can be its own cluster
180+
*/
181+
clusters = (cluster*)malloc(sizeof(cluster) * k);
182+
memset(clusters, 0, k * sizeof(cluster));
183+
for (int j = 0; j < size; j++) {
184+
clusters[j].x = observations[j].x;
185+
clusters[j].y = observations[j].y;
186+
clusters[j].count = 1;
187+
observations[j].group = j;
188+
}
189+
}
190+
return clusters;
191+
}
192+
193+
/**
194+
* @}
195+
* @}
196+
*/
197+
198+
/*! @fn printEPS
199+
* A function to print observations and clusters
200+
* The code is taken from
201+
* @link http://rosettacode.org/wiki/K-means%2B%2B_clustering
202+
* its C implementation
203+
* Even the K Means code is also inspired from it
204+
*
205+
* Note: To print in a file use pipeline operator ( ./a.out > image.eps )
206+
*
207+
* @param observations observations array
208+
* @param len size of observation array
209+
* @param cent clusters centroid's array
210+
* @param k size of cent array
211+
*/
212+
void printEPS(observation pts[], size_t len, cluster cent[], int k) {
213+
int W = 400, H = 400;
214+
double min_x = DBL_MAX, max_x = DBL_MIN, min_y = DBL_MAX, max_y = DBL_MIN;
215+
double scale = 0, cx = 0, cy = 0;
216+
double* colors = (double*)malloc(sizeof(double) * (k * 3));
217+
int i;
218+
size_t j;
219+
double kd = k * 1.0;
220+
for (i = 0; i < k; i++) {
221+
*(colors + 3 * i) = (3 * (i + 1) % k) / kd;
222+
*(colors + 3 * i + 1) = (7 * i % k) / kd;
223+
*(colors + 3 * i + 2) = (9 * i % k) / kd;
224+
}
225+
226+
for (j = 0; j < len; j++) {
227+
if (max_x < pts[j].x) max_x = pts[j].x;
228+
if (min_x > pts[j].x) min_x = pts[j].x;
229+
if (max_y < pts[j].y) max_y = pts[j].y;
230+
if (min_y > pts[j].y) min_y = pts[j].y;
231+
}
232+
scale = W / (max_x - min_x);
233+
if (scale > (H / (max_y - min_y))) {
234+
scale = H / (max_y - min_y);
235+
};
236+
cx = (max_x + min_x) / 2;
237+
cy = (max_y + min_y) / 2;
238+
239+
printf("%%!PS-Adobe-3.0 EPSF-3.0\n%%%%BoundingBox: -5 -5 %d %d\n", W + 10,
240+
H + 10);
241+
printf(
242+
"/l {rlineto} def /m {rmoveto} def\n"
243+
"/c { .25 sub exch .25 sub exch .5 0 360 arc fill } def\n"
244+
"/s { moveto -2 0 m 2 2 l 2 -2 l -2 -2 l closepath "
245+
" gsave 1 setgray fill grestore gsave 3 setlinewidth"
246+
" 1 setgray stroke grestore 0 setgray stroke }def\n");
247+
for (int i = 0; i < k; i++) {
248+
printf("%g %g %g setrgbcolor\n", *(colors + 3 * i), *(colors + 3 * i + 1),
249+
*(colors + 3 * i + 2));
250+
for (j = 0; j < len; j++) {
251+
if (pts[j].group != i) continue;
252+
printf("%.3f %.3f c\n", (pts[j].x - cx) * scale + W / 2,
253+
(pts[j].y - cy) * scale + H / 2);
254+
}
255+
printf("\n0 setgray %g %g s\n", (cent[i].x - cx) * scale + W / 2,
256+
(cent[i].y - cy) * scale + H / 2);
257+
}
258+
printf("\n%%%%EOF");
259+
260+
// free accquired memory
261+
free(colors);
262+
}
263+
264+
/*! @fn test
265+
* A function to test the kMeans function
266+
* Generates 100000 points in a circle of
267+
* radius 20.0 with center at (0,0)
268+
* and cluster them into 5 clusters
269+
*
270+
* <img alt="Output for 100000 points divided in 5 clusters" src=
271+
* "https://raw.githubusercontent.com/TheAlgorithms/C/docs/images/machine_learning/k_means_clustering/kMeansTest1.png"
272+
* width="400px" heiggt="400px">
273+
*/
274+
static void test() {
275+
size_t size = 100000L;
276+
observation* observations = (observation*)malloc(sizeof(observation) * size);
277+
double maxRadius = 20.00;
278+
double radius = 0;
279+
double ang = 0;
280+
size_t i = 0;
281+
for (; i < size; i++) {
282+
radius = maxRadius * ((double)rand() / RAND_MAX);
283+
ang = 2 * M_PI * ((double)rand() / RAND_MAX);
284+
observations[i].x = radius * cos(ang);
285+
observations[i].y = radius * sin(ang);
286+
}
287+
int k = 5; // No of clusters
288+
cluster* clusters = kMeans(observations, size, k);
289+
printEPS(observations, size, clusters, k);
290+
// Free the accquired memory
291+
free(observations);
292+
free(clusters);
293+
}
294+
295+
/*! @fn test2
296+
* A function to test the kMeans function
297+
* Generates 1000000 points in a circle of
298+
* radius 20.0 with center at (0,0)
299+
* and cluster them into 11 clusters
300+
*
301+
* <img alt="Output for 1000000 points divided in 11 clusters" src=
302+
* "https://raw.githubusercontent.com/TheAlgorithms/C/docs/images/machine_learning/k_means_clustering/kMeansTest2.png"
303+
* width="400px" heiggt="400px">
304+
*/
305+
void test2() {
306+
size_t size = 1000000L;
307+
observation* observations = (observation*)malloc(sizeof(observation) * size);
308+
double maxRadius = 20.00;
309+
double radius = 0;
310+
double ang = 0;
311+
size_t i = 0;
312+
for (; i < size; i++) {
313+
radius = maxRadius * ((double)rand() / RAND_MAX);
314+
ang = 2 * M_PI * ((double)rand() / RAND_MAX);
315+
observations[i].x = radius * cos(ang);
316+
observations[i].y = radius * sin(ang);
317+
}
318+
int k = 11; // No of clusters
319+
cluster* clusters = kMeans(observations, size, k);
320+
printEPS(observations, size, clusters, k);
321+
// Free the accquired memory
322+
free(observations);
323+
free(clusters);
324+
}
325+
326+
/*! @fn main
 * Program entry point: seeds the random number generator and
 * runs the clustering demo.
 */
int main() {
  /* seed so each run produces a different initial clustering */
  srand((unsigned int)time(NULL));
  test();        /* 100000 points into 5 clusters, EPS on stdout */
  /* test2(); */ /* larger demo: 1000000 points into 11 clusters */
  return 0;
}

0 commit comments

Comments
 (0)