knn and forest config updates

ethanglaser · ethanglaser · commit 3cd955c3eec8 · 2024-09-21T05:25:44.000Z
diff --git a/configs/spmd/large_scale/forest.json b/configs/spmd/large_scale/forest.json
@@ -0,0 +1,26 @@
+{
+    "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"],
+    "PARAMETERS_SETS": {
+        "spmd forest classification parameters": {
+            "algorithm": {
+                "estimator": "RandomForestClassifier"
+            }
+        },
+        "synthetic data": {
+            "data": [
+                { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 1000 },    "generation_kwargs": {  "n_samples": 501000, "n_features": 10, "n_classes": 2 },  "algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } } },
+                { "source": "make_classification", "split_kwargs": { "train_size": 10000, "test_size": 1000 },    "generation_kwargs": {  "n_samples": 11000, "n_features": 1000, "n_classes": 2 },  "algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } } }
+            ]
+        }
+    },
+    "TEMPLATES": {
+        "basicstats": {
+            "SETS": [
+                "sklearnex spmd implementation",
+                "large scale 2k parameters",
+                "synthetic data",
+                "spmd forest classification parameters"
+            ]
+        }
+    }
+}
diff --git a/configs/spmd/large_scale/forest_strong.json b/configs/spmd/large_scale/forest_strong.json
@@ -0,0 +1,25 @@
+{
+    "INCLUDE": ["../../common/sklearn.json", "../../spmd/stats_covariance.json", "large_scale.json"],
+    "PARAMETERS_SETS": {
+        "spmd forest classification parameters": {
+            "algorithm": {
+                "estimator": "RandomForestClassifier"
+            }
+        },
+        "synthetic data": {
+            "data": [
+                { "source": "make_classification", "split_kwargs": { "train_size": 10000000, "test_size": 1000 },    "generation_kwargs": {  "n_samples": 10001000, "n_features": 100, "n_classes": 2 },  "algorithm": { "estimator_params": { "n_estimators": 20, "max_depth": 4 } } }
+            ]
+        }
+    },
+    "TEMPLATES": {
+        "basicstats": {
+            "SETS": [
+                "sklearnex spmd implementation",
+                "large scale strong 2k parameters",
+                "synthetic data",
+                "spmd forest classification parameters"
+            ]
+        }
+    }
+}
diff --git a/configs/spmd/large_scale/knn.json b/configs/spmd/large_scale/knn.json
@@ -22,7 +22,7 @@
 		{ "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 50000 },   "generation_kwargs": {  "n_samples": 550000,  "n_features": 100, "n_classes": 2,  "n_informative": "[SPECIAL_VALUE]0.5" } },
 		{ "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 500000 },   "generation_kwargs": {  "n_samples": 550000,  "n_features": 100, "n_classes": 2,  "n_informative": "[SPECIAL_VALUE]0.5" } },
                 { "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 50000 },   "generation_kwargs": {  "n_samples": 100000,  "n_features": 1000, "n_classes": 2,  "n_informative": "[SPECIAL_VALUE]0.5" } },
-                { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 500000 },   "generation_kwargs": {  "n_samples": 1000000,  "n_features": 10, "n_classes": 2,  "n_informative": "[SPECIAL_VALUE]0.5" } }
+                { "source": "make_classification", "split_kwargs": { "train_size": 200000, "test_size": 200000 },   "generation_kwargs": {  "n_samples": 400000,  "n_features": 10, "n_classes": 2,  "n_informative": "[SPECIAL_VALUE]0.5" } }
             ]
         }	
     },
@@ -32,7 +32,7 @@
                 "common knn parameters",
                 "synthetic classification data",
                 "sklearnex spmd implementation",
-		"large scale default parameters",
+		"large scale 2k parameters",
                 "spmd knn cls parameters"
             ]
         }
diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py
@@ -525,8 +525,8 @@ def main(bench_case: BenchCase, filters: List[BenchCase]):
     result_template = enrich_result(result_template, bench_case)
     if "assume_finite" in context_params:
         result_template["assume_finite"] = context_params["assume_finite"]
-    if hasattr(estimator_instance, "get_params"):
-        estimator_params = estimator_instance.get_params()
+    #if hasattr(estimator_instance, "get_params"):
+    #    estimator_params = estimator_instance.get_params()
     # note: "handle" is not JSON-serializable
     if "handle" in estimator_params:
         del estimator_params["handle"]
diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py
@@ -181,7 +181,7 @@ def split_and_transform_data(bench_case, data, data_description):
                 "format": data_format,
                 "order": data_order,
                 "dtype": data_dtype,
-                "samples": converted_data.shape[0],
+                "samples (per rank)": converted_data.shape[0],
             }
             if len(converted_data.shape) == 2 and converted_data.shape[1] > 1:
                 data_description[subset_name]["features"] = converted_data.shape[1]

Original file line number	Diff line number	Diff line change
`@@ -22,7 +22,7 @@`
`22`	`22`	`{ "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 50000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } },`
`23`	`23`	`{ "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 500000 }, "generation_kwargs": { "n_samples": 550000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } },`
`24`	`24`	`{ "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 50000 }, "generation_kwargs": { "n_samples": 100000, "n_features": 1000, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } },`
`25`		`- { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 500000 }, "generation_kwargs": { "n_samples": 1000000, "n_features": 10, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }`
	`25`	`+ { "source": "make_classification", "split_kwargs": { "train_size": 200000, "test_size": 200000 }, "generation_kwargs": { "n_samples": 400000, "n_features": 10, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }`
`26`	`26`	`]`
`27`	`27`	`}`
`28`	`28`	`},`
`@@ -32,7 +32,7 @@`
`32`	`32`	`"common knn parameters",`
`33`	`33`	`"synthetic classification data",`
`34`	`34`	`"sklearnex spmd implementation",`
`35`		`- "large scale default parameters",`
	`35`	`+ "large scale 2k parameters",`
`36`	`36`	`"spmd knn cls parameters"`
`37`	`37`	`]`
`38`	`38`	`}`
Original file line number	Diff line number	Diff line change
`@@ -181,7 +181,7 @@ def split_and_transform_data(bench_case, data, data_description):`
`181`	`181`	`"format": data_format,`
`182`	`182`	`"order": data_order,`
`183`	`183`	`"dtype": data_dtype,`
`184`		`- "samples": converted_data.shape[0],`
	`184`	`+ "samples (per rank)": converted_data.shape[0],`
`185`	`185`	`}`
`186`	`186`	`if len(converted_data.shape) == 2 and converted_data.shape[1] > 1:`
`187`	`187`	`data_description[subset_name]["features"] = converted_data.shape[1]`