strong scaling, config updates, minor revisions

ethanglaser · ethanglaser · commit 4bd6c7f91e0d · 2024-09-18T18:01:07.000Z
diff --git a/configs/spmd/large_scale/basic_stats.json b/configs/spmd/large_scale/basic_stats.json
@@ -4,7 +4,7 @@
         "spmd basicstats parameters": {
             "algorithm": {
                 "estimator": "BasicStatistics",
-                "estimator_methods": { "training": "compute" }
+                "estimator_methods": { "training": "fit" }
             },
 	    "data": {
 		"split_kwargs": { "test_size": 0.0001 }
@@ -21,7 +21,7 @@
         "basicstats": {
             "SETS": [
                 "sklearnex spmd implementation",
-                "large scale default parameters",
+                "large scale 2k parameters",
 		"synthetic data",
                 "spmd basicstats parameters"
             ]
diff --git a/configs/spmd/large_scale/basic_stats_strong.json b/configs/spmd/large_scale/basic_stats_strong.json
@@ -0,0 +1,29 @@
+{
+    "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"],
+    "PARAMETERS_SETS": {
+        "spmd basicstats parameters": {
+            "algorithm": {
+                "estimator": "BasicStatistics",
+                "estimator_methods": { "training": "fit" }
+            },
+            "data": {
+                "split_kwargs": { "test_size": 0.0001 }
+            }
+        },
+        "synthetic data": {
+            "data": [
+                { "source": "make_blobs", "generation_kwargs": { "n_samples": 25000000,  "n_features": 100, "centers": 1 } }
+            ]
+        }
+    },
+    "TEMPLATES": {
+        "basicstats": {
+            "SETS": [
+                "sklearnex spmd implementation",
+                "large scale strong 2k parameters",
+                "synthetic data",
+                "spmd basicstats parameters"
+            ]
+        }
+    }
+}
diff --git a/configs/spmd/large_scale/covariance.json b/configs/spmd/large_scale/covariance.json
@@ -21,7 +21,7 @@
         "covariance": {
             "SETS": [
                 "sklearnex spmd implementation",
-                "large scale default parameters",
+                "large scale 2k parameters",
 		"synthetic data",
                 "spmd basicstats parameters"
             ]
diff --git a/configs/spmd/large_scale/covariance_strong.json b/configs/spmd/large_scale/covariance_strong.json
@@ -0,0 +1,29 @@
+{
+    "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"],
+    "PARAMETERS_SETS": {
+        "spmd basicstats parameters": {
+            "algorithm": {
+                "estimator": "EmpiricalCovariance",
+                "estimator_methods": { "training": "fit" }
+            },
+            "data": {
+                "split_kwargs": { "test_size": 0.0001 }
+            }
+        },
+        "synthetic data": {
+            "data": [
+                { "source": "make_blobs", "generation_kwargs": { "n_samples": 25000000,  "n_features": 100, "centers": 1 } }
+            ]
+        }
+    },
+    "TEMPLATES": {
+        "covariance": {
+            "SETS": [
+                "sklearnex spmd implementation",
+                "large scale strong 2k parameters",
+                "synthetic data",
+                "spmd basicstats parameters"
+            ]
+        }
+    }
+}
diff --git a/configs/spmd/large_scale/kmeans.json b/configs/spmd/large_scale/kmeans.json
@@ -7,15 +7,14 @@
                 "estimator_params": {
                     "algorithm": "lloyd"
                 },
-                "estimator_methods": { "training": "fit" }
-            },
-            "bench": {
-                "mpi_params": {"n": 48}
+                "estimator_methods": { "training": "fit", "inference": "predict" }
             }
 	},
 	"synthetic data": {
                 "data": [
-                        { "source": "make_blobs", "generation_kwargs": { "n_samples": 5000000,  "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }
+                        { "source": "make_blobs", "generation_kwargs": { "n_samples": 5000000,  "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } },
+                        { "source": "make_blobs", "generation_kwargs": { "n_samples": 30000,  "n_features": 1000, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } },
+                        { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000,  "n_features": 100, "centers": 100 }, "algorithm": { "n_clusters": 100, "max_iter": 100 } }
                 ]
         }
     },
@@ -24,7 +23,7 @@
             "SETS": [
                 "synthetic data",
                 "sklearnex spmd implementation",
-                "large scale default parameters",
+                "large scale 2k parameters",
                 "spmd kmeans parameters"
             ]
         }
diff --git a/configs/spmd/large_scale/knn.json b/configs/spmd/large_scale/knn.json
@@ -15,9 +15,6 @@
 			"training": "fit",
 			"inference": "predict"
 		}
-	    },
-	    "bench": {
-	        "mpi_params": {}
 	    }
         },
         "synthetic classification data": {
@@ -35,7 +32,7 @@
                 "common knn parameters",
                 "synthetic classification data",
                 "sklearnex spmd implementation",
-		"large scale 2k parameters",
+		"large scale default parameters",
                 "spmd knn cls parameters"
             ]
         }
diff --git a/configs/spmd/large_scale/large_scale.json b/configs/spmd/large_scale/large_scale.json
@@ -6,7 +6,16 @@
 		"distributed_split": "None"
             },
             "bench": {
-                "mpi_params": {"n": [1,2,3,4,5,6,7,8,9,10,11,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" }
+                "mpi_params": {"n": [1,2,6,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" }
+            }
+        },
+	"large scale strong parameters": {
+            "data": {
+                "dtype": "float64",
+                "distributed_split": "rank_based"
+            },
+            "bench": {
+                "mpi_params": {"n": [1,2,6,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" }
             }
         },
 	"large scale 2k parameters": {
@@ -15,7 +24,16 @@
                 "distributed_split": "None"
             },
             "bench": {
-                "mpi_params": {"n": [192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" }
+                "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" }
+            }
+        },
+        "large scale strong 2k parameters": {
+            "data": {
+                "dtype": "float64",
+                "distributed_split": "rank_based"
+            },
+            "bench": {
+                "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" }
             }
         },
 	"large scale impi parameters": {
@@ -24,7 +42,7 @@
                 "distributed_split": "no"
             },
             "bench": {
-		    "mpi_params": {"n": [1,2,4,6,12,24], "ppn": 12}
+		    "mpi_params": {"n": [1,2,4,6,9,12], "ppn": 12}
 	    }
 	}
     }
diff --git a/configs/spmd/large_scale/linear_model.json b/configs/spmd/large_scale/linear_model.json
@@ -18,7 +18,7 @@
         "linreg": {
             "SETS": [
                 "sklearnex spmd implementation",
-                "large scale default parameters",
+                "large scale 2k parameters",
 		"synthetic data",
                 "spmd linear parameters"
             ]
diff --git a/configs/spmd/large_scale/linear_model_strong.json b/configs/spmd/large_scale/linear_model_strong.json
@@ -0,0 +1,26 @@
+{
+    "INCLUDE": ["../../common/sklearn.json", "../../regular/linear_model.json", "large_scale.json"],
+    "PARAMETERS_SETS": {
+        "spmd linear parameters": {
+            "algorithm": {
+                "estimator": "LinearRegression",
+                "estimator_methods": { "training": "fit" }
+            }
+        },
+        "synthetic data": {
+            "data": [
+                { "source": "make_regression", "generation_kwargs": { "n_samples": 25005000,  "n_features": 100, "noise": 1.25 }, "split_kwargs": { "train_size": 25000000, "test_size": 5000 } }
+            ]
+        }
+    },
+    "TEMPLATES": {
+        "linreg": {
+            "SETS": [
+                "sklearnex spmd implementation",
+                "large scale strong 2k parameters",
+                "synthetic data",
+                "spmd linear parameters"
+            ]
+        }
+    }
+}
diff --git a/configs/spmd/large_scale/logreg_strong.json b/configs/spmd/large_scale/logreg_strong.json
@@ -0,0 +1,28 @@
+{
+    "INCLUDE": ["../../common/sklearn.json", "../../regular/logreg.json", "../logreg.json", "large_scale.json"],
+    "PARAMETERS_SETS": {
+        "spmd logreg2 parameters": {
+            "algorithm": {
+                "estimator": "LogisticRegression",
+                "estimator_methods": { "inference": "predict" },
+                "estimator_params": { "max_iter": 30 }
+            }
+        },
+        "synthetic data": {
+            "data": [
+                { "source": "make_classification", "split_kwargs": { "train_size": 10000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 10001000, "n_features": 100, "n_classes": 2 } }
+            ]
+        }
+    },
+    "TEMPLATES": {
+        "logreg": {
+            "SETS": [
+                "sklearnex spmd implementation",
+                "large scale strong 2k parameters",
+                "spmd logreg parameters",
+                "synthetic data",
+                "spmd logreg2 parameters"
+            ]
+        }
+    }
+}
diff --git a/configs/spmd/large_scale/pca.json b/configs/spmd/large_scale/pca.json
@@ -21,7 +21,7 @@
         "linreg": {
             "SETS": [
 		"sklearnex spmd implementation",
-                "large scale default parameters",
+                "large scale 2k parameters",
                 "synthetic data",
 		"spmd pca parameters"
             ]
diff --git a/configs/spmd/large_scale/pca_strong.json b/configs/spmd/large_scale/pca_strong.json
@@ -0,0 +1,29 @@
+{
+    "INCLUDE": ["../../common/sklearn.json", "../../regular/pca.json", "large_scale.json"],
+    "PARAMETERS_SETS": {
+        "spmd pca parameters": {
+            "algorithm": {
+                "estimator": "PCA",
+                "estimator_methods": { "training": "fit", "inference": "" }
+            },
+            "data": {
+                "split_kwargs": { "test_size": 0.0001 }
+            }
+        },
+        "synthetic data": {
+            "data": [
+                { "source": "make_blobs", "generation_kwargs": { "n_samples": 25000000,  "n_features": 100, "centers": 1 } }
+            ]
+        }
+    },
+    "TEMPLATES": {
+        "pca": {
+            "SETS": [
+                "sklearnex spmd implementation",
+                "large scale strong 2k parameters",
+                "synthetic data",
+                "spmd pca parameters"
+            ]
+        }
+    }
+}
diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py
@@ -436,7 +436,9 @@ def measure_sklearn_estimator(
                 (
                     metrics[method]["time[ms]"],
                     metrics[method]["time std[ms]"],
-                    _,
+                    metrics[method]["first iter[ms]"],
+                    metrics[method]["box filter mean[ms]"],
+                    metrics[method]["box filter std[ms]"]
                 ) = measure_case(bench_case, method_instance, *data_args)
                 if batch_size is not None:
                     metrics[method]["throughput[samples/ms]"] = (
diff --git a/sklbench/utils/measurement.py b/sklbench/utils/measurement.py
@@ -40,6 +40,22 @@ def box_filter(timing, left=0.2, right=0.8):
     return np.mean(result) * 1000, np.std(result) * 1000
 
 
+def large_scale_measurements(timing):
+    """Return (mean, std, first run, IQR-filtered mean, IQR-filtered std) in ms."""
+    first_iter = timing[0] * 1000
+    # Mean/std skip the first run; a lone run is used as-is so they are never NaN.
+    later_runs = timing[1:] if len(timing) > 1 else timing
+    mean, stdev = np.mean(later_runs) * 1000, np.std(later_runs) * 1000
+    # Tukey fences over all runs: keep timings within 1.5 * IQR of the quartiles.
+    timing_sorted = np.sort(timing)
+    Q1, Q3 = np.percentile(timing_sorted, [25, 75])
+    lower, upper = Q1 - 1.5 * (Q3 - Q1), Q3 + 1.5 * (Q3 - Q1)
+    filtered_times = timing_sorted[(timing_sorted >= lower) & (timing_sorted <= upper)]
+    box_filter_mean = np.mean(filtered_times) * 1000 if filtered_times.size > 0 else 0
+    box_filter_stdev = np.std(filtered_times) * 1000 if filtered_times.size > 0 else 0
+    return mean, stdev, first_iter, box_filter_mean, box_filter_stdev
+
+
 def measure_time(
     func,
     *args,
@@ -72,13 +88,14 @@ def measure_time(
                 f"exceeded time limit ({time_limit} seconds)"
             )
             break
-    mean, std = box_filter(times)
-    if std / mean > std_mean_ratio:
-        logger.warning(
-            f'Measured "std / mean" time ratio of "{str(func)}" function is higher '
-            f"than threshold ({round(std / mean, 3)} vs. {std_mean_ratio})"
-        )
-    return mean, std, func_return_value
+    logger.debug(times)
+    #mean, std = box_filter(times)
+    #if std / mean > std_mean_ratio:
+    #    logger.warning(
+    #        f'Measured "std / mean" time ratio of "{str(func)}" function is higher '
+    #        f"than threshold ({round(std / mean, 3)} vs. {std_mean_ratio})"
+    #    )
+    return large_scale_measurements(times)
 
 
 # wrapper to get measurement params from benchmarking case

Original file line number	Diff line number	Diff line change
`@@ -21,7 +21,7 @@`
`21`	`21`	`"covariance": {`
`22`	`22`	`"SETS": [`
`23`	`23`	`"sklearnex spmd implementation",`
`24`		`- "large scale default parameters",`
	`24`	`+ "large scale 2k parameters",`
`25`	`25`	`"synthetic data",`
`26`	`26`	`"spmd basicstats parameters"`
`27`	`27`	`]`
Original file line number	Diff line number	Diff line change
`@@ -7,15 +7,14 @@`
`7`	`7`	`"estimator_params": {`
`8`	`8`	`"algorithm": "lloyd"`
`9`	`9`	`},`
`10`		`- "estimator_methods": { "training": "fit" }`
`11`		`- },`
`12`		`- "bench": {`
`13`		`- "mpi_params": {"n": 48}`
	`10`	`+ "estimator_methods": { "training": "fit", "inference": "predict" }`
`14`	`11`	`}`
`15`	`12`	`},`
`16`	`13`	`"synthetic data": {`
`17`	`14`	`"data": [`
`18`		`- { "source": "make_blobs", "generation_kwargs": { "n_samples": 5000000, "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }`
	`15`	`+ { "source": "make_blobs", "generation_kwargs": { "n_samples": 5000000, "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } },`
	`16`	`+ { "source": "make_blobs", "generation_kwargs": { "n_samples": 30000, "n_features": 1000, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } },`
	`17`	`+ { "source": "make_blobs", "generation_kwargs": { "n_samples": 1000000, "n_features": 100, "centers": 100 }, "algorithm": { "n_clusters": 100, "max_iter": 100 } }`
`19`	`18`	`]`
`20`	`19`	`}`
`21`	`20`	`},`
`@@ -24,7 +23,7 @@`
`24`	`23`	`"SETS": [`
`25`	`24`	`"synthetic data",`
`26`	`25`	`"sklearnex spmd implementation",`
`27`		`- "large scale default parameters",`
	`26`	`+ "large scale 2k parameters",`
`28`	`27`	`"spmd kmeans parameters"`
`29`	`28`	`]`
`30`	`29`	`}`
Original file line number	Diff line number	Diff line change
`@@ -15,9 +15,6 @@`
`15`	`15`	`"training": "fit",`
`16`	`16`	`"inference": "predict"`
`17`	`17`	`}`
`18`		`- },`
`19`		`- "bench": {`
`20`		`- "mpi_params": {}`
`21`	`18`	`}`
`22`	`19`	`},`
`23`	`20`	`"synthetic classification data": {`
`@@ -35,7 +32,7 @@`
`35`	`32`	`"common knn parameters",`
`36`	`33`	`"synthetic classification data",`
`37`	`34`	`"sklearnex spmd implementation",`
`38`		`- "large scale 2k parameters",`
	`35`	`+ "large scale default parameters",`
`39`	`36`	`"spmd knn cls parameters"`
`40`	`37`	`]`
`41`	`38`	`}`
Original file line number	Diff line number	Diff line change
`@@ -18,7 +18,7 @@`
`18`	`18`	`"linreg": {`
`19`	`19`	`"SETS": [`
`20`	`20`	`"sklearnex spmd implementation",`
`21`		`- "large scale default parameters",`
	`21`	`+ "large scale 2k parameters",`
`22`	`22`	`"synthetic data",`
`23`	`23`	`"spmd linear parameters"`
`24`	`24`	`]`