DBSCAN large-scale support and logistic regression (logreg) details

ethanglaser · ethanglaser · commit 80257199e245 · 2024-10-07T21:45:55.000Z
diff --git a/configs/spmd/large_scale/dbscan_strong.json b/configs/spmd/large_scale/dbscan_strong.json
@@ -0,0 +1,32 @@
+{
+    "INCLUDE": ["../../common/sklearn.json", "../../regular/dbscan.json", "large_scale.json"],
+    "PARAMETERS_SETS": {
+        "spmd dbscan parameters": {
+	    "algorithm": {
+		"estimator": "DBSCAN",
+		"estimator_methods": {
+		    "training": "fit"
+		}
+	    },
+	    "data": {
+		"dtype": "float64"
+	    }
+	},
+	"synthetic dataset": {
+            "data": [
+                { "source": "make_blobs", "generation_kwargs": { "n_samples": 400000,  "n_features": 100, "centers": 10 }, "algorithm": { "eps": 5, "min_samples": 5 } }
+            ]
+	}
+    },
+    "TEMPLATES": {
+        "dbscan": {
+            "SETS": [
+                "common dbscan parameters",
+                "synthetic dataset",
+                "sklearnex spmd implementation",
+		"large scale strong parameters",
+                "spmd dbscan parameters"
+            ]
+        }
+    }
+}
diff --git a/configs/spmd/large_scale/large_scale.json b/configs/spmd/large_scale/large_scale.json
@@ -27,6 +27,24 @@
                 "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" }
             }
         },
+        "large scale <64 parameters": {
+            "data": {
+                "dtype": "float64",
+                "distributed_split": "None"
+            },
+            "bench": {
+                "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" }
+            }
+        },
+        "large scale >64 parameters": {
+            "data": {
+                "dtype": "float64",
+                "distributed_split": "None"
+            },
+            "bench": {
+                "mpi_params": {"n": [768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" }
+            }
+        },
         "large scale strong 2k parameters": {
             "data": {
                 "dtype": "float64",
@@ -36,6 +54,15 @@
                 "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" }
             }
         },
+        "large scale strong <64 parameters": {
+            "data": {
+                "dtype": "float64",
+                "distributed_split": "rank_based"
+            },
+            "bench": {
+                "mpi_params": {"n": [1,2,6,12,24,48,96,192,384,768], "ppn": 12, "-hostfile": "", "-cpu-bind=list:0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203": "--envall gpu_tile_compact.sh" }
+            }
+        },
 	"large scale impi parameters": {
 	    "data": {
 		"dtype": "float64",
diff --git a/configs/spmd/large_scale/logreg_strong.json b/configs/spmd/large_scale/logreg_strong.json
@@ -5,7 +5,7 @@
 	    "algorithm":{
 		"estimator": "LogisticRegression",
                 "estimator_methods": { "inference": "predict" },
-		"estimator_params": { "max_iter": 30 }
+		"estimator_params": { "max_iter": 16 }
             }
 	},
         "synthetic data": {
diff --git a/sklbench/utils/measurement.py b/sklbench/utils/measurement.py
@@ -72,12 +72,16 @@ def measure_time(
         )
     times = []
     func_return_value = None
+    inners, iters = [], []
     while len(times) < n_runs:
         if enable_itt and itt_is_available:
             itt.resume()
         t0 = timeit.default_timer()
         func_return_value = func(*args, **kwargs)
         t1 = timeit.default_timer()
+        if hasattr(func.__self__, "_n_inner_iter"):
+            inners.append(func.__self__._n_inner_iter)
+            iters.append(func.__self__.n_iter_)
         if enable_itt and itt_is_available:
             itt.pause()
         times.append(t1 - t0)
@@ -88,6 +92,9 @@ def measure_time(
                 f"exceeded time limit ({time_limit} seconds)"
             )
             break
+    from mpi4py import MPI
+    if MPI.COMM_WORLD.Get_rank() == 0:
+        logger.debug("iters across n runs: " + str(iters) + ", inner iters across n runs: " + str(inners))
     logger.debug(times)
     #mean, std = box_filter(times)
     #if std / mean > std_mean_ratio:

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -5,7 +5,7 @@` |
| 5 | 5 | `"algorithm":{` |
| 6 | 6 | `"estimator": "LogisticRegression",` |
| 7 | 7 | `"estimator_methods": { "inference": "predict" },` |
| 8 |  | `- "estimator_params": { "max_iter": 30 }` |
|  | 8 | `+ "estimator_params": { "max_iter": 16 }` |
| 9 | 9 | `}` |
| 10 | 10 | `},` |
| 11 | 11 | `"synthetic data": {` |