2828class KMeansLocal {
2929
3030 // the minimum distance that is considered to be "far enough" to a centroid in order to compute the soar distance.
31- // For vectors that are closer than this distance to the centroid, we use the squared distance to find the
32- // second closest centroid.
31+ // Vectors that are closer than this distance to their centroid are not spilled, because they are already well
32+ // represented by the centroid itself. In many cases this indicates a degenerate distribution, e.g. a cluster that
33+ // is composed of many equal vectors.
3334 private static final float SOAR_MIN_DISTANCE = 1e-16f ;
3435
3536 final int sampleSize ;
@@ -281,19 +282,18 @@ private void assignSpilled(
281282 final float [] distances = new float [4 ];
282283 for (int i = 0 ; i < vectors .size (); i ++) {
283284 float [] vector = vectors .vectorValue (i );
284-
285285 int currAssignment = assignments [i ];
286286 float [] currentCentroid = centroids [currAssignment ];
287-
288287 // TODO: cache these?
289288 float vectorCentroidDist = VectorUtil .squareDistance (vector , currentCentroid );
290-
291- if (vectorCentroidDist > SOAR_MIN_DISTANCE ) {
292- for (int j = 0 ; j < vectors .dimension (); j ++) {
293- diffs [j ] = vector [j ] - currentCentroid [j ];
294- }
289+ if (vectorCentroidDist <= SOAR_MIN_DISTANCE ) {
290+ spilledAssignments [i ] = -1 ; // no SOAR assignment
291+ continue ;
295292 }
296293
294+ for (int j = 0 ; j < vectors .dimension (); j ++) {
295+ diffs [j ] = vector [j ] - currentCentroid [j ];
296+ }
297297 final int centroidCount ;
298298 final IntToIntFunction centroidOrds ;
299299 if (neighborhoods != null ) {
@@ -310,29 +310,17 @@ private void assignSpilled(
310310 float minSoar = Float .MAX_VALUE ;
311311 int j = 0 ;
312312 for (; j < limit ; j += 4 ) {
313- if (vectorCentroidDist > SOAR_MIN_DISTANCE ) {
314- ESVectorUtil .soarDistanceBulk (
315- vector ,
316- centroids [centroidOrds .apply (j )],
317- centroids [centroidOrds .apply (j + 1 )],
318- centroids [centroidOrds .apply (j + 2 )],
319- centroids [centroidOrds .apply (j + 3 )],
320- diffs ,
321- soarLambda ,
322- vectorCentroidDist ,
323- distances
324- );
325- } else {
326- // if the vector is very close to the centroid, we look for the second-nearest centroid
327- ESVectorUtil .squareDistanceBulk (
328- vector ,
329- centroids [centroidOrds .apply (j )],
330- centroids [centroidOrds .apply (j + 1 )],
331- centroids [centroidOrds .apply (j + 2 )],
332- centroids [centroidOrds .apply (j + 3 )],
333- distances
334- );
335- }
313+ ESVectorUtil .soarDistanceBulk (
314+ vector ,
315+ centroids [centroidOrds .apply (j )],
316+ centroids [centroidOrds .apply (j + 1 )],
317+ centroids [centroidOrds .apply (j + 2 )],
318+ centroids [centroidOrds .apply (j + 3 )],
319+ diffs ,
320+ soarLambda ,
321+ vectorCentroidDist ,
322+ distances
323+ );
336324 for (int k = 0 ; k < distances .length ; k ++) {
337325 float soar = distances [k ];
338326 if (soar < minSoar ) {
@@ -344,13 +332,7 @@ private void assignSpilled(
344332
345333 for (; j < centroidCount ; j ++) {
346334 int centroidOrd = centroidOrds .apply (j );
347- float soar ;
348- if (vectorCentroidDist > SOAR_MIN_DISTANCE ) {
349- soar = ESVectorUtil .soarDistance (vector , centroids [centroidOrd ], diffs , soarLambda , vectorCentroidDist );
350- } else {
351- // if the vector is very close to the centroid, we look for the second-nearest centroid
352- soar = VectorUtil .squareDistance (vector , centroids [centroidOrd ]);
353- }
335+ float soar = ESVectorUtil .soarDistance (vector , centroids [centroidOrd ], diffs , soarLambda , vectorCentroidDist );
354336 if (soar < minSoar ) {
355337 minSoar = soar ;
356338 bestAssignment = centroidOrd ;
0 commit comments