28
28
class KMeansLocal {
29
29
30
30
// The minimum distance from a centroid that is considered "far enough" to compute the SOAR distance.
31
- // For vectors that are closer than this distance to the centroid, we use the squared distance to find the
32
- // second closest centroid.
31
+ // Vectors that are closer than this distance to the centroid don't get spilled because they are well represented
32
+ // by the centroid itself. In many cases, this indicates a degenerate distribution, e.g. the cluster is composed of
33
+ // many equal vectors.
33
34
private static final float SOAR_MIN_DISTANCE = 1e-16f ;
34
35
35
36
final int sampleSize ;
@@ -281,19 +282,18 @@ private void assignSpilled(
281
282
final float [] distances = new float [4 ];
282
283
for (int i = 0 ; i < vectors .size (); i ++) {
283
284
float [] vector = vectors .vectorValue (i );
284
-
285
285
int currAssignment = assignments [i ];
286
286
float [] currentCentroid = centroids [currAssignment ];
287
-
288
287
// TODO: cache these?
289
288
float vectorCentroidDist = VectorUtil .squareDistance (vector , currentCentroid );
290
-
291
- if (vectorCentroidDist > SOAR_MIN_DISTANCE ) {
292
- for (int j = 0 ; j < vectors .dimension (); j ++) {
293
- diffs [j ] = vector [j ] - currentCentroid [j ];
294
- }
289
+ if (vectorCentroidDist <= SOAR_MIN_DISTANCE ) {
290
+ spilledAssignments [i ] = -1 ; // no SOAR assignment
291
+ continue ;
295
292
}
296
293
294
+ for (int j = 0 ; j < vectors .dimension (); j ++) {
295
+ diffs [j ] = vector [j ] - currentCentroid [j ];
296
+ }
297
297
final int centroidCount ;
298
298
final IntToIntFunction centroidOrds ;
299
299
if (neighborhoods != null ) {
@@ -310,29 +310,17 @@ private void assignSpilled(
310
310
float minSoar = Float .MAX_VALUE ;
311
311
int j = 0 ;
312
312
for (; j < limit ; j += 4 ) {
313
- if (vectorCentroidDist > SOAR_MIN_DISTANCE ) {
314
- ESVectorUtil .soarDistanceBulk (
315
- vector ,
316
- centroids [centroidOrds .apply (j )],
317
- centroids [centroidOrds .apply (j + 1 )],
318
- centroids [centroidOrds .apply (j + 2 )],
319
- centroids [centroidOrds .apply (j + 3 )],
320
- diffs ,
321
- soarLambda ,
322
- vectorCentroidDist ,
323
- distances
324
- );
325
- } else {
326
- // if the vector is very close to the centroid, we look for the second-nearest centroid
327
- ESVectorUtil .squareDistanceBulk (
328
- vector ,
329
- centroids [centroidOrds .apply (j )],
330
- centroids [centroidOrds .apply (j + 1 )],
331
- centroids [centroidOrds .apply (j + 2 )],
332
- centroids [centroidOrds .apply (j + 3 )],
333
- distances
334
- );
335
- }
313
+ ESVectorUtil .soarDistanceBulk (
314
+ vector ,
315
+ centroids [centroidOrds .apply (j )],
316
+ centroids [centroidOrds .apply (j + 1 )],
317
+ centroids [centroidOrds .apply (j + 2 )],
318
+ centroids [centroidOrds .apply (j + 3 )],
319
+ diffs ,
320
+ soarLambda ,
321
+ vectorCentroidDist ,
322
+ distances
323
+ );
336
324
for (int k = 0 ; k < distances .length ; k ++) {
337
325
float soar = distances [k ];
338
326
if (soar < minSoar ) {
@@ -344,13 +332,7 @@ private void assignSpilled(
344
332
345
333
for (; j < centroidCount ; j ++) {
346
334
int centroidOrd = centroidOrds .apply (j );
347
- float soar ;
348
- if (vectorCentroidDist > SOAR_MIN_DISTANCE ) {
349
- soar = ESVectorUtil .soarDistance (vector , centroids [centroidOrd ], diffs , soarLambda , vectorCentroidDist );
350
- } else {
351
- // if the vector is very close to the centroid, we look for the second-nearest centroid
352
- soar = VectorUtil .squareDistance (vector , centroids [centroidOrd ]);
353
- }
335
+ float soar = ESVectorUtil .soarDistance (vector , centroids [centroidOrd ], diffs , soarLambda , vectorCentroidDist );
354
336
if (soar < minSoar ) {
355
337
minSoar = soar ;
356
338
bestAssignment = centroidOrd ;
0 commit comments