Skip to content

Commit b3500dd

Browse files
committed
Creating branch for large scale measurements
1 parent eddb9e8 commit b3500dd

File tree

12 files changed

+298
-6
lines changed

12 files changed

+298
-6
lines changed
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
{
2+
"INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"],
3+
"PARAMETERS_SETS": {
4+
"spmd basicstats parameters": {
5+
"algorithm": {
6+
"estimator": "BasicStatistics",
7+
"estimator_methods": { "training": "compute" }
8+
},
9+
"data": {
10+
"split_kwargs": { "test_size": 0.0001 }
11+
}
12+
},
13+
"synthetic data": {
14+
"data": [
15+
{ "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } },
16+
{ "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } }
17+
]
18+
}
19+
},
20+
"TEMPLATES": {
21+
"basicstats": {
22+
"SETS": [
23+
"sklearnex spmd implementation",
24+
"large scale default parameters",
25+
"synthetic data",
26+
"spmd basicstats parameters"
27+
]
28+
}
29+
}
30+
}
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
{
2+
"INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"],
3+
"PARAMETERS_SETS": {
4+
"spmd basicstats parameters": {
5+
"algorithm": {
6+
"estimator": "EmpiricalCovariance",
7+
"estimator_methods": { "training": "fit" }
8+
},
9+
"data": {
10+
"split_kwargs": { "test_size": 0.0001 }
11+
}
12+
},
13+
"synthetic data": {
14+
"data": [
15+
{ "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } },
16+
{ "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } }
17+
]
18+
}
19+
},
20+
"TEMPLATES": {
21+
"covariance": {
22+
"SETS": [
23+
"sklearnex spmd implementation",
24+
"large scale default parameters",
25+
"synthetic data",
26+
"spmd basicstats parameters"
27+
]
28+
}
29+
}
30+
}

configs/spmd/large_scale/dbscan.json

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
{
2+
"INCLUDE": ["../../common/sklearn.json", "../../regular/dbscan.json", "large_scale.json"],
3+
"PARAMETERS_SETS": {
4+
"spmd dbscan parameters": {
5+
"algorithm": {
6+
"estimator": "DBSCAN",
7+
"estimator_methods": {
8+
"training": "fit"
9+
}
10+
},
11+
"data": {
12+
"dtype": "float64"
13+
}
14+
},
15+
"synthetic dataset": {
16+
"data": [
17+
{ "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 10, "centers": 10 }, "algorithm": { "eps": 5, "min_samples": 5 } }
18+
]
19+
}
20+
},
21+
"TEMPLATES": {
22+
"dbscan": {
23+
"SETS": [
24+
"common dbscan parameters",
25+
"synthetic dataset",
26+
"sklearnex spmd implementation",
27+
"large scale default parameters",
28+
"spmd dbscan parameters"
29+
]
30+
}
31+
}
32+
}

configs/spmd/large_scale/kmeans.json

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
{
2+
"INCLUDE": ["../../common/sklearn.json", "../../regular/kmeans.json", "large_scale.json"],
3+
"PARAMETERS_SETS": {
4+
"spmd kmeans parameters": {
5+
"algorithm": {
6+
"estimator": "KMeans",
7+
"estimator_params": {
8+
"algorithm": "lloyd"
9+
},
10+
"estimator_methods": { "training": "fit" }
11+
},
12+
"bench": {
13+
"mpi_params": {"n": 48}
14+
}
15+
},
16+
"synthetic data": {
17+
"data": [
18+
{ "source": "make_blobs", "generation_kwargs": { "n_samples": 5000000, "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }
19+
]
20+
}
21+
},
22+
"TEMPLATES": {
23+
"kmeans": {
24+
"SETS": [
25+
"synthetic data",
26+
"sklearnex spmd implementation",
27+
"large scale default parameters",
28+
"spmd kmeans parameters"
29+
]
30+
}
31+
}
32+
}

configs/spmd/large_scale/knn.json

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
{
2+
"INCLUDE": ["../../common/sklearn.json", "../../regular/knn.json", "large_scale.json"],
3+
"PARAMETERS_SETS": {
4+
"spmd knn cls parameters": {
5+
"algorithm": {
6+
"estimator": "KNeighborsClassifier",
7+
"estimator_params": {
8+
"algorithm": "brute",
9+
"metric": "minkowski",
10+
"p": 2,
11+
"weights": "uniform",
12+
"n_neighbors": 5
13+
},
14+
"estimator_methods": {
15+
"training": "fit",
16+
"inference": "predict"
17+
}
18+
},
19+
"bench": {
20+
"mpi_params": {}
21+
}
22+
},
23+
"synthetic classification data": {
24+
"data": [
25+
{ "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 50000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } },
26+
{ "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 500000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } },
27+
{ "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 50000 }, "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } },
28+
{ "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 500000 }, "generation_kwargs": { "n_samples": 1000000, "n_features": 10, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }
29+
]
30+
}
31+
},
32+
"TEMPLATES": {
33+
"knn classifier": {
34+
"SETS": [
35+
"common knn parameters",
36+
"synthetic classification data",
37+
"sklearnex spmd implementation",
38+
"large scale 2k parameters",
39+
"spmd knn cls parameters"
40+
]
41+
}
42+
}
43+
}
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
{
2+
"PARAMETERS_SETS": {
3+
"large scale default parameters": {
4+
"data": {
5+
"dtype": "float64",
6+
"distributed_split": "None"
7+
},
8+
"bench": {
9+
"mpi_params": {"n": [1,2,3,4,5,6,7,8,9,10,11,12], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" }
10+
}
11+
},
12+
"large scale 2k parameters": {
13+
"data": {
14+
"dtype": "float64",
15+
"distributed_split": "None"
16+
},
17+
"bench": {
18+
"mpi_params": {"n": [192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" }
19+
}
20+
},
21+
"large scale impi parameters": {
22+
"data": {
23+
"dtype": "float64",
24+
"distributed_split": "None"
25+
},
26+
"bench": {
27+
"mpi_params": {"n": [1,2,4,6,12,24], "ppn": 12}
28+
}
29+
}
30+
}
31+
}
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
{
2+
"INCLUDE": ["../../common/sklearn.json", "../../regular/linear_model.json", "large_scale.json"],
3+
"PARAMETERS_SETS": {
4+
"spmd linear parameters": {
5+
"algorithm": {
6+
"estimator": "LinearRegression",
7+
"estimator_methods": { "training": "fit" }
8+
}
9+
},
10+
"synthetic data": {
11+
"data": [
12+
{ "source": "make_regression", "generation_kwargs": { "n_samples": 30005000, "n_features": 10, "noise": 1.25 }, "split_kwargs": { "train_size": 30000000, "test_size": 5000 } },
13+
{ "source": "make_regression", "generation_kwargs": { "n_samples": 305000, "n_features": 1000, "noise": 1.25 }, "split_kwargs": { "train_size": 300000, "test_size": 5000 } }
14+
]
15+
}
16+
},
17+
"TEMPLATES": {
18+
"linreg": {
19+
"SETS": [
20+
"sklearnex spmd implementation",
21+
"large scale default parameters",
22+
"synthetic data",
23+
"spmd linear parameters"
24+
]
25+
}
26+
}
27+
}

configs/spmd/large_scale/logreg.json

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
{
2+
"INCLUDE": ["../../common/sklearn.json", "../../regular/logreg.json", "../logreg.json", "large_scale.json"],
3+
"PARAMETERS_SETS": {
4+
"spmd logreg2 parameters": {
5+
"algorithm":{
6+
"estimator": "LogisticRegression",
7+
"estimator_methods": { "inference": "predict" },
8+
"estimator_params": { "max_iter": 20 }
9+
}
10+
},
11+
"synthetic data": {
12+
"data": [
13+
{ "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 10, "n_classes": 2 } },
14+
{ "source": "make_classification", "split_kwargs": { "train_size": 100000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 101000, "n_features": 1000, "n_classes": 2 } }
15+
]
16+
}
17+
},
18+
"TEMPLATES": {
19+
"logreg": {
20+
"SETS": [
21+
"sklearnex spmd implementation",
22+
"large scale 2k parameters",
23+
"spmd logreg parameters",
24+
"synthetic data",
25+
"spmd logreg2 parameters"
26+
]
27+
}
28+
}
29+
}

configs/spmd/large_scale/pca.json

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
{
2+
"INCLUDE": ["../../common/sklearn.json", "../../regular/pca.json", "large_scale.json"],
3+
"PARAMETERS_SETS": {
4+
"spmd pca parameters": {
5+
"algorithm": {
6+
"estimator": "PCA",
7+
"estimator_methods": { "training": "fit", "inference": "" }
8+
},
9+
"data": {
10+
"split_kwargs": { "test_size": 0.0001 }
11+
}
12+
},
13+
"synthetic data": {
14+
"data": [
15+
{ "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } },
16+
{ "source": "make_blobs", "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "centers": 1 } }
17+
]
18+
}
19+
},
20+
"TEMPLATES": {
21+
"pca": {
22+
"SETS": [
23+
"sklearnex spmd implementation",
24+
"large scale default parameters",
25+
"synthetic data",
26+
"spmd pca parameters"
27+
]
28+
}
29+
}
30+
}

sklbench/benchmarks/sklearn_estimator.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,9 @@ def get_subset_metrics_of_estimator(
134134
and isinstance(iterations[0], Union[Numeric, NumpyNumeric].__args__)
135135
):
136136
metrics.update({"iterations": int(iterations[0])})
137+
if hasattr(estimator_instance, "_n_inner_iter"):
138+
inner_iters = estimator_instance._n_inner_iter
139+
metrics.update({"inner_iters": int(inner_iters)})
137140
if task == "classification":
138141
y_pred = convert_to_numpy(estimator_instance.predict(x))
139142
metrics.update(
@@ -142,7 +145,7 @@ def get_subset_metrics_of_estimator(
142145
"balanced accuracy": float(balanced_accuracy_score(y_compat, y_pred)),
143146
}
144147
)
145-
if hasattr(estimator_instance, "predict_proba") and not (
148+
'''if hasattr(estimator_instance, "predict_proba") and not (
146149
hasattr(estimator_instance, "probability")
147150
and getattr(estimator_instance, "probability") == False
148151
):
@@ -162,7 +165,7 @@ def get_subset_metrics_of_estimator(
162165
),
163166
"logloss": float(log_loss(y_compat, y_pred_proba)),
164167
}
165-
)
168+
)'''
166169
elif task == "regression":
167170
y_pred = convert_to_numpy(estimator_instance.predict(x))
168171
metrics.update(
@@ -429,7 +432,6 @@ def measure_sklearn_estimator(
429432
estimator_instance.get_booster()
430433
)
431434
method_instance = getattr(daal_model, method)
432-
433435
metrics[method] = dict()
434436
(
435437
metrics[method]["time[ms]"],

0 commit comments

Comments
 (0)