Skip to content

Commit 7d0f9ba

Browse files
Merge pull request #20 from bibikar/feature/dbscan
Add DBSCAN benchmarks
2 parents d735f55 + 2145f5e commit 7d0f9ba

File tree

6 files changed

+222
-4
lines changed

6 files changed

+222
-4
lines changed

daal4py/dbscan.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
# Copyright (C) 2020 Intel Corporation
2+
#
3+
# SPDX-License-Identifier: MIT
4+
5+
import argparse
6+
from bench import parse_args, time_mean_min, print_header, print_row, size_str
7+
from daal4py import dbscan
8+
from daal4py.sklearn.utils import getFPType
9+
import numpy as np
10+
11+
# Command-line interface: the input file is mandatory, while the DBSCAN
# hyperparameters (eps, min-samples) fall back to defaults.
parser = argparse.ArgumentParser(
    description='daal4py DBSCAN clustering benchmark'
)
parser.add_argument(
    '-x', '--filex', '--fileX', '--input',
    type=str,
    required=True,
    help='Points to cluster',
)
parser.add_argument(
    '-e', '--eps', '--epsilon',
    type=float,
    default=10.,
    help='Radius of neighborhood of a point',
)
parser.add_argument(
    '-m', '--min-samples',
    type=int,
    default=5,
    help=('The minimum number of samples required in a neighborhood '
          'to consider a point a core point'),
)
params = parse_args(parser, prefix='daal4py')

# Load generated data
X = np.load(params.filex)

# Describe the input on params: a size string built from X.shape, and
# the array's dtype.
params.size = size_str(X.shape)
params.dtype = X.dtype
27+
28+
29+
# Define functions to time
30+
def test_dbscan(X):
    """Run one daal4py DBSCAN computation on X and return its result.

    The floating-point type is obtained from getFPType(X); epsilon and
    minObservations are read from the module-level ``params`` object
    (``params.eps`` and ``params.min_samples``).
    """
    clustering = dbscan(
        fptype=getFPType(X),
        epsilon=params.eps,
        minObservations=params.min_samples,
        resultsToCompute='computeCoreIndices',
    )
    result = clustering.compute(X)
    return result
38+
39+
40+
# Columns of the benchmark report, in output order
columns = (
    'batch', 'arch', 'prefix', 'function', 'threads', 'dtype', 'size',
    'n_clusters', 'time',
)
print_header(columns, params)

# Time clustering; the timing controls all come from the parsed params.
timing_kwargs = dict(
    outer_loops=params.outer_loops,
    inner_loops=params.inner_loops,
    goal_outer_loops=params.goal,
    time_limit=params.time_limit,
    verbose=params.verbose,
)
elapsed, result = time_mean_min(test_dbscan, X, **timing_kwargs)

# The cluster count is read from element [0, 0] of the result's nClusters.
params.n_clusters = result.nClusters[0, 0]
print_row(columns, params, function='DBSCAN', time=elapsed)

native/Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
1-
# Copyright (C) 2018 Intel Corporation
1+
# Copyright (C) 2018-2020 Intel Corporation
22
#
33
# SPDX-License-Identifier: MIT
44

55
BENCHMARKS += distances kmeans linear ridge pca svm log_reg_lbfgs \
6-
decision_forest_regr decision_forest_clsf
6+
decision_forest_regr decision_forest_clsf dbscan
77
FOBJ = $(addprefix lbfgsb/,lbfgsb.o linpack.o timer.o)
88
CXXSRCS = $(addsuffix _bench.cpp,$(BENCHMARKS))
99

native/common.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -420,7 +420,8 @@ void print_numeric_table(dm::NumericTablePtr X_nt, std::string label) {
420420
}
421421
std::cout << std::endl;
422422
}
423-
std::cout << std::setprecision(prec) << std::defaultfloat;
423+
std::cout << std::setprecision(prec);
424+
std::cout.unsetf(std::ios_base::floatfield);
424425

425426
X_nt->releaseBlockOfRows(blockX);
426427

native/dbscan_bench.cpp

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
/*
2+
* Copyright (C) 2020 Intel Corporation
3+
*
4+
* SPDX-License-Identifier: MIT
5+
*/
6+
7+
#include <vector>
8+
#include <utility>
9+
#include <algorithm>
10+
#include <iostream>
11+
#include <fstream>
12+
#include <chrono>
13+
14+
#define DAAL_DATA_TYPE double
15+
#include "common.hpp"
16+
#include "CLI11.hpp"
17+
#include "daal.h"
18+
#include "npyfile.h"
19+
20+
21+
/*
 * Run the DAAL batch DBSCAN algorithm (double precision) once.
 *
 * X_nt:        numeric table set as the algorithm's `data` input
 * eps:         first argument passed to the da::dbscan::Batch constructor
 * min_samples: second argument passed to the da::dbscan::Batch constructor
 *
 * Returns the result object obtained from the algorithm after compute().
 */
da::dbscan::ResultPtr
dbscan_test(dm::NumericTablePtr X_nt, double eps, int min_samples) {

    da::dbscan::Batch<double> dbscan_batch(eps, min_samples);
    dbscan_batch.input.set(da::dbscan::data, X_nt);
    dbscan_batch.compute();

    da::dbscan::ResultPtr result = dbscan_batch.getResult();
    return result;

}
31+
32+
33+
int main(int argc, char *argv[]) {
34+
35+
CLI::App app("Native benchmark for Intel(R) DAAL DBSCAN clustering");
36+
37+
std::string batch, arch, prefix;
38+
int num_threads;
39+
bool header, verbose;
40+
add_common_args(app, batch, arch, prefix, num_threads, header, verbose);
41+
42+
struct timing_options timing_opts = {100, 100, 10., 10};
43+
add_timing_args(app, "", timing_opts);
44+
45+
std::string filex, filei;
46+
app.add_option("-x,--filex,--fileX", filex,
47+
"Feature file name")
48+
->required()->check(CLI::ExistingFile);
49+
50+
double eps = 10.;
51+
app.add_option("-e,--eps,--epsilon", eps,
52+
"Radius of neighborhood of a point");
53+
54+
int min_samples = 5;
55+
app.add_option("-m,--min-samples", min_samples,
56+
"The minimum number of samples required in a neighborhood "
57+
"to consider a point a core point");
58+
59+
CLI11_PARSE(app, argc, argv);
60+
61+
// Set DAAL thread count
62+
int daal_threads = set_threads(num_threads);
63+
64+
// Load data
65+
struct npyarr *arrX = load_npy(filex.c_str());
66+
if (!arrX) {
67+
std::cerr << "Failed to load input array" << std::endl;
68+
return EXIT_FAILURE;
69+
}
70+
if (arrX->shape_len != 2) {
71+
std::cerr << "Expected 2 dimensions for X, found "
72+
<< arrX->shape_len << std::endl;
73+
return EXIT_FAILURE;
74+
}
75+
76+
// Infer data size from loaded arrays
77+
std::ostringstream stringSizeStream;
78+
stringSizeStream << arrX->shape[0] << 'x' << arrX->shape[1];
79+
std::string stringSize = stringSizeStream.str();
80+
81+
// Create numeric tables from input data
82+
dm::NumericTablePtr X_nt = make_table((double *) arrX->data,
83+
arrX->shape[0],
84+
arrX->shape[1]);
85+
86+
// Prepare meta-info
87+
std::string header_string = "Batch,Arch,Prefix,Threads,Size,Function,"
88+
"Clusters,Time";
89+
std::ostringstream meta_info_stream;
90+
meta_info_stream
91+
<< batch << ','
92+
<< arch << ','
93+
<< prefix << ','
94+
<< daal_threads << ','
95+
<< stringSize << ',';
96+
std::string meta_info = meta_info_stream.str();
97+
98+
// Actually time benches
99+
double time;
100+
da::dbscan::ResultPtr dbscan_result;
101+
std::tie(time, dbscan_result) = time_min<da::dbscan::ResultPtr> ([=] {
102+
return dbscan_test(X_nt, eps, min_samples);
103+
}, timing_opts, verbose);
104+
105+
// Get number of clusters found
106+
dm::NumericTablePtr n_clusters_nt
107+
= dbscan_result->get(da::dbscan::nClusters);
108+
dm::BlockDescriptor<int> n_clusters_block;
109+
n_clusters_nt->getBlockOfRows(0, 1, dm::readOnly, n_clusters_block);
110+
int n_clusters = n_clusters_block.getBlockPtr()[0];
111+
n_clusters_nt->releaseBlockOfRows(n_clusters_block);
112+
113+
std::cout << meta_info << "DBSCAN," << n_clusters << ',' << time
114+
<< std::endl;
115+
116+
return 0;
117+
}
118+

sklearn/bench.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ def parse_args(parser, size=None, dtypes=None, loop_types=(),
121121

122122
n_jobs = None
123123
if n_jobs_supported and not daal_version:
124-
n_jobs = num_threads = params.num_threads
124+
n_jobs = num_threads = params.threads
125125

126126
# Set threading and DAAL related params here
127127
setattr(params, 'threads', num_threads)

sklearn/dbscan.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
# Copyright (C) 2020 Intel Corporation
2+
#
3+
# SPDX-License-Identifier: MIT
4+
5+
import argparse
6+
from bench import parse_args, time_mean_min, print_header, print_row, size_str
7+
import numpy as np
8+
from sklearn.cluster import DBSCAN
9+
10+
# Command-line interface: the input file is mandatory, while the DBSCAN
# hyperparameters (eps, min-samples) fall back to defaults.
parser = argparse.ArgumentParser(
    description='scikit-learn DBSCAN benchmark'
)
parser.add_argument(
    '-x', '--filex', '--fileX', '--input',
    type=str,
    required=True,
    help='Points to cluster',
)
parser.add_argument(
    '-e', '--eps', '--epsilon',
    type=float,
    default=10.,
    help='Radius of neighborhood of a point',
)
parser.add_argument(
    '-m', '--min-samples',
    type=int,
    default=5,
    help=('The minimum number of samples required in a neighborhood '
          'to consider a point a core point'),
)
params = parse_args(parser, n_jobs_supported=True)

# Load generated data
X = np.load(params.filex)

# Create our clustering object
dbscan = DBSCAN(
    eps=params.eps,
    min_samples=params.min_samples,
    metric='euclidean',
    algorithm='auto',
    n_jobs=params.n_jobs,
)

# N.B. with algorithm='auto', daal4py-patched scikit-learn will select
# DAAL's brute force method, whereas unpatched scikit-learn will probably
# pick 'kd_tree'.
31+
32+
# Columns of the benchmark report, in output order
columns = (
    'batch', 'arch', 'prefix', 'function', 'threads', 'dtype', 'size',
    'n_clusters', 'time',
)

# Describe the input on params before the header is printed
params.size = size_str(X.shape)
params.dtype = X.dtype
print_header(columns, params)

# Time fit; the timing controls all come from the parsed params.
timing_kwargs = dict(
    outer_loops=params.outer_loops,
    inner_loops=params.inner_loops,
    goal_outer_loops=params.goal,
    time_limit=params.time_limit,
    verbose=params.verbose,
)
fit_time, _ = time_mean_min(dbscan.fit, X, **timing_kwargs)

# Count the distinct labels, leaving out -1 if it occurs.
cluster_ids = set(dbscan.labels_)
cluster_ids.discard(-1)
params.n_clusters = len(cluster_ids)
print_row(columns, params, function='DBSCAN', time=fit_time)

0 commit comments

Comments
 (0)