@@ -105,8 +105,8 @@ public static <R> void assignAndRecompute(List<Weighted<Integer>> sampledPoints,
105105 double minDist = Double .MAX_VALUE ;
106106 int minDistNbr = -1 ;
107107 for (int i = 0 ; i < clusters .size (); i ++) {
108+ // will check for negative distances
108109 dist [i ] = clusters .get (i ).distance (getPoint .apply (point .index ), distance );
109- checkArgument (dist [i ] >= 0 , "distance cannot be negative" );
110110 if (minDist > dist [i ]) {
111111 minDist = dist [i ];
112112 minDistNbr = i ;
@@ -201,10 +201,18 @@ public static <R> List<ICluster<R>> iterativeClustering(int maxAllowed, int init
201201 boolean phase2GlobalReassign , double overlapParameter , List <ICluster <R >> previousClustering ) {
202202
203203 checkArgument (refs .size () > 0 , "empty list, nothing to do" );
204- checkArgument (maxAllowed >= stopAt && stopAt > 0 , "incorrect bounds on number of clusters" );
204+ checkArgument (stopAt > 0 , "has to stop at 1 cluster" );
205+ checkArgument (stopAt <= maxAllowed , "cannot stop before achieving the limit" );
205206
206207 Random rng = new Random (seed );
207- double sampledSum = refs .stream ().map (e -> (double ) e .weight ).reduce (Double ::sum ).get ();
208+ double sampledSum = refs .stream ().map (e -> {
209+ checkArgument (!Double .isNaN (e .weight ), " weights have to be non-NaN" );
210+ checkArgument (Double .isFinite (e .weight ), " weights have to be finite" );
211+ checkArgument (e .weight >= 0.0 , () -> "negative weights are not meaningful" + e .weight );
212+ return (double ) e .weight ;
213+ }).reduce (0.0 , Double ::sum );
214+ checkArgument (sampledSum > 0 , " total weight has to be positive" );
215+
208216 ArrayList <ICluster <R >> centers = new ArrayList <>();
209217 if (refs .size () < 10 * (initial + 5 )) {
210218 for (Weighted <Integer > point : refs ) {
@@ -294,6 +302,8 @@ public static <R> List<ICluster<R>> iterativeClustering(int maxAllowed, int init
294302 }
295303 centers .sort (Comparator .comparingDouble (ICluster ::getWeight ));
296304 while (centers .get (0 ).getWeight () == 0.0 ) {
305+ // this line is reachable via zeroTest() in
306+ // SampleSummaryTest
297307 centers .remove (0 );
298308 }
299309 if (inital < 1.2 * maxAllowed + 1 ) {
@@ -345,14 +355,14 @@ public static <R> List<ICluster<R>> summarize(List<Weighted<R>> points, int maxA
345355 List <ICluster <R >> previousClustering ) {
346356 checkArgument (maxAllowed < 100 , "are you sure you want more elements in the summary?" );
347357 checkArgument (maxAllowed <= initial , "initial parameter should be at least maximum allowed in final result" );
348- checkArgument (stopAt > 0 && stopAt <= maxAllowed , "lower bound set incorrectly" );
349358
350359 double totalWeight = points .stream ().map (e -> {
351- checkArgument (e .weight >= 0.0 , "negative weights are not meaningful" );
360+ checkArgument (!Double .isNaN (e .weight ), " weights have to be non-NaN" );
361+ checkArgument (Double .isFinite (e .weight ), " weights have to be finite" );
362+ checkArgument (e .weight >= 0.0 , () -> "negative weights are not meaningful" + e .weight );
352363 return (double ) e .weight ;
353364 }).reduce (0.0 , Double ::sum );
354- checkArgument (!Double .isNaN (totalWeight ) && Double .isFinite (totalWeight ),
355- " weights have to finite and non-NaN" );
365+ checkArgument (totalWeight > 0 , " total weight has to be positive" );
356366 Random rng = new Random (seed );
357367 // the following list is explicitly copied and sorted for potential efficiency
358368 List <Weighted <R >> sampledPoints = createSample (points , rng .nextLong (), 5 * LENGTH_BOUND , 0.005 , 1.0 );
@@ -363,8 +373,6 @@ public static <R> List<ICluster<R>> summarize(List<Weighted<R>> points, int maxA
363373 }
364374
365375 Function <Integer , R > getPoint = (i ) -> sampledPoints .get (i ).index ;
366- checkArgument (sampledPoints .size () > 0 , "empty list, nothing to do" );
367- double sampledSum = sampledPoints .stream ().map (e -> (double ) e .weight ).reduce (Double ::sum ).get ();
368376
369377 return iterativeClustering (maxAllowed , initial , stopAt , refs , getPoint , distance , clusterInitializer ,
370378 rng .nextLong (), parallelEnabled , phase2GlobalReassign , overlapParameter , previousClustering );
@@ -403,11 +411,13 @@ public static SampleSummary summarize(List<Weighted<float[]>> points, int maxAll
403411 checkArgument (maxAllowed <= initial , "initial parameter should be at least maximum allowed in final result" );
404412
405413 double totalWeight = points .stream ().map (e -> {
406- checkArgument (e .weight >= 0.0 , "negative weights are not meaningful" );
414+ checkArgument (!Double .isNaN (e .weight ), " weights have to be non-NaN" );
415+ checkArgument (Double .isFinite (e .weight ), " weights have to be finite" );
416+ checkArgument (e .weight >= 0.0 , () -> "negative weights are not meaningful" + e .weight );
407417 return (double ) e .weight ;
408418 }).reduce (0.0 , Double ::sum );
409- checkArgument (! Double . isNaN ( totalWeight ) && Double . isFinite ( totalWeight ),
410- " weights have to finite and non-NaN" );
419+ checkArgument (totalWeight > 0 , " total weight has to be positive" );
420+
411421 Random rng = new Random (seed );
412422 // the following list is explicitly copied and sorted for potential efficiency
413423 List <Weighted <float []>> sampledPoints = createSample (points , rng .nextLong (), 5 * LENGTH_BOUND , 0.005 , 1.0 );
@@ -458,24 +468,24 @@ public static SampleSummary summarize(float[][] points, int maxAllowed, int init
458468 * @param maxAllowed maximum number of groups/clusters
459469 * @param initial a parameter controlling the initialization
460470 * @param reassignPerStep if reassignment is to be performed each step
471+ * @param seed random seed
461472 * @return a summarization
462473 */
463- public static SampleSummary summarize (List <Weighted <float []>> points , int maxAllowed , int initial ,
464- boolean reassignPerStep ) {
465- return summarize (points , maxAllowed , initial , reassignPerStep , Summarizer ::L2distance , new Random ().nextLong (),
466- false );
474+ public static SampleSummary l2summarize (List <Weighted <float []>> points , int maxAllowed , int initial ,
475+ boolean reassignPerStep , long seed ) {
476+ return summarize (points , maxAllowed , initial , reassignPerStep , Summarizer ::L2distance , seed , false );
467477 }
468478
469479 /**
470480 * Same as above, with the most common use cases filled in
471481 *
472482 * @param points points in float[][], each of weight 1.0
473483 * @param maxAllowed maximum number of clusters one is interested in
484+ * @param seed random seed
474485 * @return a summarization
475486 */
476- public static SampleSummary summarize (float [][] points , int maxAllowed ) {
477- return summarize (points , maxAllowed , 4 * maxAllowed , false , Summarizer ::L2distance , new Random ().nextLong (),
478- false );
487+ public static SampleSummary l2summarize (float [][] points , int maxAllowed , long seed ) {
488+ return summarize (points , maxAllowed , 4 * maxAllowed , false , Summarizer ::L2distance , seed , false );
479489 }
480490
481491 /**
@@ -529,9 +539,9 @@ public static <R> List<ICluster<R>> multiSummarize(R[] points, int maxAllowed, i
529539 clusterInitializer , seed , parallelEnabled , null );
530540 }
531541
532- // same as above, with defaults
542+ // same as above, with multicenter instead of generic
533543 public static List <ICluster <float []>> multiSummarize (float [][] points , int maxAllowed , double shrinkage ,
534- int numberOfRepresentatives ) {
544+ int numberOfRepresentatives , long seed ) {
535545
536546 ArrayList <Weighted <float []>> weighted = new ArrayList <>();
537547 for (float [] point : points ) {
@@ -540,7 +550,7 @@ public static List<ICluster<float[]>> multiSummarize(float[][] points, int maxAl
540550 BiFunction <float [], Float , ICluster <float []>> clusterInitializer = (a , b ) -> MultiCenter .initialize (a , b ,
541551 shrinkage , numberOfRepresentatives );
542552 return summarize (weighted , maxAllowed , 4 * maxAllowed , 1 , true , DEFAULT_SEPARATION_RATIO_FOR_MERGE ,
543- Summarizer ::L2distance , clusterInitializer , new Random (). nextLong () , true , null );
553+ Summarizer ::L2distance , clusterInitializer , seed , true , null );
544554 }
545555
546556}
0 commit comments