Skip to content

Commit e341b99

Browse files
committed
Add DAAL DBSCAN benchmark
1 parent 0bbcbee commit e341b99

File tree

4 files changed

+127
-15
lines changed

4 files changed

+127
-15
lines changed

daal4py/dbscan.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (C) 2017-2019 Intel Corporation
1+
# Copyright (C) 2020 Intel Corporation
22
#
33
# SPDX-License-Identifier: MIT
44

@@ -12,18 +12,15 @@
1212
'benchmark')
1313
parser.add_argument('-x', '--filex', '--fileX', '--input', required=True,
1414
type=str, help='Points to cluster')
15-
parser.add_argument('-e', '--eps', '--epsilon', type=float, default=10,
15+
parser.add_argument('-e', '--eps', '--epsilon', type=float, default=10.,
1616
help='Radius of neighborhood of a point')
17-
parser.add_argument('-m', '--data-multiplier', default=100,
18-
type=int, help='Data multiplier')
19-
parser.add_argument('-M', '--min-samples', default=5, type=int,
17+
parser.add_argument('-m', '--min-samples', default=5, type=int,
2018
help='The minimum number of samples required in a '
2119
'neighborhood to consider a point a core point')
2220
params = parse_args(parser, prefix='daal4py')
2321

2422
# Load generated data
2523
X = np.load(params.filex)
26-
X_mult = np.vstack((X,) * params.data_multiplier)
2724

2825
params.size = size_str(X.shape)
2926
params.dtype = X.dtype

native/Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
1-
# Copyright (C) 2018 Intel Corporation
1+
# Copyright (C) 2018-2020 Intel Corporation
22
#
33
# SPDX-License-Identifier: MIT
44

55
BENCHMARKS += distances kmeans linear ridge pca svm log_reg_lbfgs \
6-
decision_forest_regr decision_forest_clsf
6+
decision_forest_regr decision_forest_clsf dbscan
77
FOBJ = $(addprefix lbfgsb/,lbfgsb.o linpack.o timer.o)
88
CXXSRCS = $(addsuffix _bench.cpp,$(BENCHMARKS))
99

native/dbscan_bench.cpp

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
/*
2+
* Copyright (C) 2020 Intel Corporation
3+
*
4+
* SPDX-License-Identifier: MIT
5+
*/
6+
7+
#include <vector>
8+
#include <utility>
9+
#include <algorithm>
10+
#include <iostream>
11+
#include <fstream>
12+
#include <chrono>
13+
14+
#define DAAL_DATA_TYPE double
15+
#include "common.hpp"
16+
#include "CLI11.hpp"
17+
#include "daal.h"
18+
#include "npyfile.h"
19+
20+
21+
/*
 * Run one batch DBSCAN clustering pass over X_nt with the given
 * neighborhood radius (eps) and core-point threshold (min_samples),
 * returning the DAAL result object (holds assignments and nClusters).
 */
da::dbscan::ResultPtr
dbscan_test(dm::NumericTablePtr X_nt, double eps, int min_samples) {
    // Configure and execute the algorithm in a single pass.
    da::dbscan::Batch<double> clustering(eps, min_samples);
    clustering.input.set(da::dbscan::data, X_nt);
    clustering.compute();
    return clustering.getResult();
}
31+
32+
33+
int main(int argc, char *argv[]) {
34+
35+
CLI::App app("Native benchmark for Intel(R) DAAL DBSCAN clustering");
36+
37+
std::string batch, arch, prefix;
38+
int num_threads;
39+
bool header, verbose;
40+
add_common_args(app, batch, arch, prefix, num_threads, header, verbose);
41+
42+
struct timing_options timing_opts = {100, 100, 10., 10};
43+
add_timing_args(app, "", timing_opts);
44+
45+
std::string filex, filei;
46+
app.add_option("-x,--filex,--fileX", filex,
47+
"Feature file name")
48+
->required()->check(CLI::ExistingFile);
49+
50+
double eps = 10.;
51+
app.add_option("-e,--eps,--epsilon", eps,
52+
"Radius of neighborhood of a point");
53+
54+
int min_samples = 5;
55+
app.add_option("-m,--min-samples", min_samples,
56+
"The minimum number of samples required in a neighborhood "
57+
"to consider a point a core point");
58+
59+
CLI11_PARSE(app, argc, argv);
60+
61+
// Set DAAL thread count
62+
int daal_threads = set_threads(num_threads);
63+
64+
// Load data
65+
struct npyarr *arrX = load_npy(filex.c_str());
66+
if (!arrX) {
67+
std::cerr << "Failed to load input array" << std::endl;
68+
return EXIT_FAILURE;
69+
}
70+
if (arrX->shape_len != 2) {
71+
std::cerr << "Expected 2 dimensions for X, found "
72+
<< arrX->shape_len << std::endl;
73+
return EXIT_FAILURE;
74+
}
75+
76+
// Infer data size from loaded arrays
77+
std::ostringstream stringSizeStream;
78+
stringSizeStream << arrX->shape[0] << 'x' << arrX->shape[1];
79+
std::string stringSize = stringSizeStream.str();
80+
81+
// Create numeric tables from input data
82+
dm::NumericTablePtr X_nt = make_table((double *) arrX->data,
83+
arrX->shape[0],
84+
arrX->shape[1]);
85+
86+
// Prepare meta-info
87+
std::string header_string = "Batch,Arch,Prefix,Threads,Size,Function,"
88+
"Clusters,Time";
89+
std::ostringstream meta_info_stream;
90+
meta_info_stream
91+
<< batch << ','
92+
<< arch << ','
93+
<< prefix << ','
94+
<< daal_threads << ','
95+
<< stringSize << ',';
96+
std::string meta_info = meta_info_stream.str();
97+
98+
// Actually time benches
99+
double time;
100+
da::dbscan::ResultPtr dbscan_result;
101+
std::tie(time, dbscan_result) = time_min<da::dbscan::ResultPtr> ([=] {
102+
return dbscan_test(X_nt, eps, min_samples);
103+
}, timing_opts, verbose);
104+
105+
// Get number of clusters found
106+
dm::NumericTablePtr n_clusters_nt
107+
= dbscan_result->get(da::dbscan::nClusters);
108+
dm::BlockDescriptor<int> n_clusters_block;
109+
n_clusters_nt->getBlockOfRows(0, 1, dm::readOnly, n_clusters_block);
110+
int n_clusters = n_clusters_block.getBlockPtr()[0];
111+
n_clusters_nt->releaseBlockOfRows(n_clusters_block);
112+
113+
std::cout << meta_info << "DBSCAN," << n_clusters << time << std::endl;
114+
115+
return 0;
116+
}
117+

sklearn/dbscan.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (C) 2017-2019 Intel Corporation
1+
# Copyright (C) 2020 Intel Corporation
22
#
33
# SPDX-License-Identifier: MIT
44

@@ -10,26 +10,24 @@
1010
parser = argparse.ArgumentParser(description='scikit-learn DBSCAN benchmark')
1111
parser.add_argument('-x', '--filex', '--fileX', '--input', required=True,
1212
type=str, help='Points to cluster')
13-
parser.add_argument('-e', '--eps', '--epsilon', type=float, default=10,
13+
parser.add_argument('-e', '--eps', '--epsilon', type=float, default=10.,
1414
help='Radius of neighborhood of a point')
15-
parser.add_argument('-m', '--data-multiplier', default=100,
16-
type=int, help='Data multiplier')
17-
parser.add_argument('-M', '--min-samples', default=5, type=int,
15+
parser.add_argument('-m', '--min-samples', default=5, type=int,
1816
help='The minimum number of samples required in a '
1917
'neighborhood to consider a point a core point')
2018
params = parse_args(parser, n_jobs_supported=True)
2119

2220
# Load generated data
2321
X = np.load(params.filex)
24-
X_mult = np.vstack((X,) * params.data_multiplier)
2522

2623
# Create our clustering object
2724
dbscan = DBSCAN(eps=params.eps, n_jobs=params.n_jobs,
2825
min_samples=params.min_samples, metric='euclidean',
2926
algorithm='auto')
3027

3128
# N.B. algorithm='auto' will select DAAL's brute force method when running
32-
# daal4py-patched scikit-learn.
29+
# daal4py-patched scikit-learn, and probably 'kdtree' when running unpatched
30+
# scikit-learn.
3331

3432
columns = ('batch', 'arch', 'prefix', 'function', 'threads', 'dtype', 'size',
3533
'n_clusters', 'time')

0 commit comments

Comments
 (0)