IntelPython
diff --git a/‎LICENSE
Lines changed: 1 addition & 1 deletion b/‎LICENSE
Lines changed: 1 addition & 1 deletion
diff --git a/‎Makefile
Lines changed: 100 additions & 37 deletions b/‎Makefile
Lines changed: 100 additions & 37 deletions
diff --git a/‎README.md
Lines changed: 1 addition & 74 deletions b/‎README.md
Lines changed: 1 addition & 74 deletions
diff --git a/‎daal4py/args.py
Lines changed: 28 additions & 0 deletions b/‎daal4py/args.py
Lines changed: 28 additions & 0 deletions
diff --git a/‎daal4py/bench.py
Lines changed: 27 additions & 0 deletions b/‎daal4py/bench.py
Lines changed: 27 additions & 0 deletions
diff --git a/‎daal4py/distances.py
Lines changed: 52 additions & 0 deletions b/‎daal4py/distances.py
Lines changed: 52 additions & 0 deletions
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2017-2018 Intel Corporation
+Copyright (c) 2017-2019 Intel Corporation
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 
@@ -1,10 +1,12 @@
 # Sizes
 DISTANCES_SIZE = 1000x15000
 REGRESSION_SIZE = 1000000x50
-KMEANS_SIZE = $(REGRESSION_SIZE)
-SVM_VECTORS = 10000
+KMEANS_SAMPLES = 1000000
+KMEANS_FEATURES = 50
+KMEANS_SIZE = $(KMEANS_SAMPLES)x$(KMEANS_FEATURES)
+SVM_SAMPLES = 10000
 SVM_FEATURES = 1000
-ITERATIONS = ?
+ITERATIONS = 10
 
 # Bookkeeping options
 # BATCH is expanded once (:=) so every benchmark in a run shares one batch ID;
 # with a recursive '=' the date command would be re-run on each expansion.
 BATCH := $(shell date -Iseconds)
@@ -15,18 +17,21 @@ NUM_THREADS = -1
 SVM_NUM_THREADS = 0
 MULTIPLIER = 100
 DATA_DIR = data/
-KMEANS_DATA = $(addsuffix .csv,$(addprefix data/kmeans_,$(KMEANS_SIZE))) 
+KMEANS_DATA = data/kmeans_$(KMEANS_SIZE).npy
 
 comma = ,
 
 ifneq ($(CONDA_PREFIX),)
-    LD_LIBRARY_PATH := $(CONDA_PREFIX)/lib
+    # Append the conda lib directory. The separator is only emitted when
+    # LD_LIBRARY_PATH is already non-empty, so no empty entry (which the
+    # dynamic loader treats as the current directory) ends up in the path.
+    LD_LIBRARY_PATH := $(if $(LD_LIBRARY_PATH),$(LD_LIBRARY_PATH):)$(CONDA_PREFIX)/lib
     export LD_LIBRARY_PATH
 endif
 
+export I_MPI_ROOT
 
 all: native python
 
+python: sklearn daal4py
+
 native: data
 	git submodule init && git submodule update
 	@echo "# Compiling native benchmarks"
@@ -40,56 +45,114 @@ native: data
 		$(NUM_THREADS) double $(REGRESSION_SIZE)
 	native/bin/linear $(BATCH) $(HOST) native linear \
 		$(NUM_THREADS) double $(REGRESSION_SIZE)
-	native/bin/kmeans $(BATCH) $(HOST) native kmeans.fit \
-		$(NUM_THREADS) double $(REGRESSION_SIZE) $(DATA_DIR)
-	native/bin/kmeans_predict $(BATCH) $(HOST) native kmeans.predict \
-		$(NUM_THREADS) double $(REGRESSION_SIZE) $(DATA_DIR) $(MULTIPLIER)
+	native/bin/kmeans $(BATCH) $(HOST) native kmeans \
+		$(NUM_THREADS) double $(KMEANS_SIZE) $(DATA_DIR) $(MULTIPLIER)
 	native/bin/two_class_svm \
-		--fileX data/two/X-$(SVM_VECTORS)x$(SVM_FEATURES).npy.csv \
-		--fileY data/two/y-$(SVM_VECTORS)x$(SVM_FEATURES).npy.csv \
-		--num-threads $(SVM_NUM_THREADS)
+		--fileX data/two/X-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
+		--fileY data/two/y-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
+		--num-threads $(SVM_NUM_THREADS) --header
 	native/bin/multi_class_svm \
-		--fileX data/multi/X-$(SVM_VECTORS)x$(SVM_FEATURES).npy.csv \
-		--fileY data/multi/y-$(SVM_VECTORS)x$(SVM_FEATURES).npy.csv \
-		--num-threads $(SVM_NUM_THREADS)
+		--fileX data/multi/X-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
+		--fileY data/multi/y-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
+		--num-threads $(SVM_NUM_THREADS) --header
+	native/bin/log_reg_lbfgs \
+		--fileX data/two/X-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
+		--fileY data/two/y-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
+		--num-threads $(SVM_NUM_THREADS) --header
+	native/bin/log_reg_lbfgs \
+		--fileX data/multi/X-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
+		--fileY data/multi/y-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
+		--num-threads $(SVM_NUM_THREADS) --header
+	native/bin/decision_forest_clsf \
+		--fileX data/two/X-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
+		--fileY data/two/y-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
+		--num-threads $(SVM_NUM_THREADS) --header
+	native/bin/decision_forest_clsf \
+		--fileX data/multi/X-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
+		--fileY data/multi/y-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
+		--num-threads $(SVM_NUM_THREADS) --header
 
-python: data
-	@echo "# Running python benchmarks"
-	python python/distances.py --batchID $(BATCH) --arch $(HOST) \
+sklearn: data
+	@echo "# Running scikit-learn benchmarks"
+	python sklearn/distances.py --batchID $(BATCH) --arch $(HOST) \
 		--prefix python --core-number $(NUM_THREADS) \
 		--size $(subst x,$(comma),$(DISTANCES_SIZE)) --iteration $(ITERATIONS)
-	python python/ridge.py --batchID $(BATCH) --arch $(HOST) \
+	python sklearn/ridge.py --batchID $(BATCH) --arch $(HOST) \
 		--prefix python --core-number $(NUM_THREADS) \
 		--size $(subst x,$(comma),$(REGRESSION_SIZE)) --iteration $(ITERATIONS)
-	python python/linear.py --batchID $(BATCH) --arch $(HOST) \
+	python sklearn/linear.py --batchID $(BATCH) --arch $(HOST) \
 		--prefix python --core-number $(NUM_THREADS) \
 		--size $(subst x,$(comma),$(REGRESSION_SIZE)) --iteration $(ITERATIONS)
-	python python/kmeans.py --batchID $(BATCH) --arch $(HOST) \
+	python sklearn/kmeans.py --batchID $(BATCH) --arch $(HOST) \
 		--prefix python --core-number $(NUM_THREADS) \
 		--size $(subst x,$(comma),$(KMEANS_SIZE)) --iteration $(ITERATIONS) \
-		--input $(DATA_DIR)
-	python python/kmeans_predict.py --batchID $(BATCH) --arch $(HOST) \
+		-x $(KMEANS_DATA) -i $(basename $(KMEANS_DATA)).init.npy
+	python sklearn/svm_bench.py --core-number $(NUM_THREADS) \
+		--fileX data/two/X-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
+		--fileY data/two/y-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
+		--header
+	python sklearn/svm_bench.py --core-number $(NUM_THREADS) \
+		--fileX data/multi/X-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
+		--fileY data/multi/y-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
+		--header
+	python sklearn/log_reg.py --num-threads $(NUM_THREADS) \
+		--fileX data/two/X-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
+		--fileY data/two/y-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
+		--header
+	python sklearn/log_reg.py --num-threads $(NUM_THREADS) \
+		--fileX data/multi/X-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
+		--fileY data/multi/y-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
+		--header
+	python sklearn/df_clsf.py --num-threads $(NUM_THREADS) \
+		--fileX data/two/X-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
+		--fileY data/two/y-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
+		--header
+	python sklearn/df_clsf.py --num-threads $(NUM_THREADS) \
+		--fileX data/multi/X-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
+		--fileY data/multi/y-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
+		--header
+
+daal4py: data
+	@echo "# Running daal4py benchmarks"
+	python daal4py/distances.py --batchID $(BATCH) --arch $(HOST) \
+		--prefix python --core-number $(NUM_THREADS) \
+		--size $(subst x,$(comma),$(DISTANCES_SIZE)) --iteration $(ITERATIONS)
+	python daal4py/ridge.py --batchID $(BATCH) --arch $(HOST) \
+		--prefix python --core-number $(NUM_THREADS) \
+		--size $(subst x,$(comma),$(REGRESSION_SIZE)) --iteration $(ITERATIONS)
+	python daal4py/linear.py --batchID $(BATCH) --arch $(HOST) \
+		--prefix python --core-number $(NUM_THREADS) \
+		--size $(subst x,$(comma),$(REGRESSION_SIZE)) --iteration $(ITERATIONS)
+	python daal4py/kmeans.py --batchID $(BATCH) --arch $(HOST) \
 		--prefix python --core-number $(NUM_THREADS) \
 		--size $(subst x,$(comma),$(KMEANS_SIZE)) --iteration $(ITERATIONS) \
-		--input $(DATA_DIR) --data-multiplier $(MULTIPLIER)
-	python python/svm_bench.py --core-number $(NUM_THREADS) \
-		--fileX data/two/X-$(SVM_VECTORS)x$(SVM_FEATURES).npy \
-		--fileY data/two/y-$(SVM_VECTORS)x$(SVM_FEATURES).npy
-	python python/svm_bench.py --core-number $(NUM_THREADS) \
-		--fileX data/multi/X-$(SVM_VECTORS)x$(SVM_FEATURES).npy \
-		--fileY data/multi/y-$(SVM_VECTORS)x$(SVM_FEATURES).npy
+		-x $(KMEANS_DATA) -i $(basename $(KMEANS_DATA)).init.npy
+	python daal4py/svm_bench.py --core-number $(NUM_THREADS) \
+		--fileX data/two/X-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
+		--fileY data/two/y-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
+		--header
+	python daal4py/svm_bench.py --core-number $(NUM_THREADS) \
+		--fileX data/multi/X-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
+		--fileY data/multi/y-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
+		--header
 
 data: $(KMEANS_DATA) svm_data
 
 $(KMEANS_DATA): | data/
-	python python/kmeans_data.py --size \
-		$(shell basename $@ .csv | cut -d _ -f 2) --fname $@ --clusters 10
+	python make_datasets.py -f $(KMEANS_FEATURES) -s $(KMEANS_SAMPLES) \
+		kmeans -c 10 -x $(basename $@) -i $(basename $@).init \
+		-t $(basename $@).tol
+
+svm_data: data/two/X-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
+	data/multi/X-$(SVM_SAMPLES)x$(SVM_FEATURES).npy
 
-svm_data: data/two/X-$(SVM_VECTORS)x$(SVM_FEATURES).npy.csv
+# Two-class dataset: the label file is written next to the feature file,
+# with the X- prefix of the file name replaced by y-.
+data/two/X-$(SVM_SAMPLES)x$(SVM_FEATURES).npy: | data/
+	python make_datasets.py -f $(SVM_FEATURES) -s $(SVM_SAMPLES) \
+		classification -c 2 -x $@ -y $(@D)/$(subst X-,y-,$(@F))
 
-data/two/X-$(SVM_VECTORS)x$(SVM_FEATURES).npy.csv: | data/
-	python python/svm_data.py -v $(SVM_VECTORS) -f $(SVM_FEATURES)
-	native/svm_native_data.sh
+# Five-class dataset: the label file is written next to the feature file,
+# with the X- prefix of the file name replaced by y-.
+data/multi/X-$(SVM_SAMPLES)x$(SVM_FEATURES).npy: | data/
+	python make_datasets.py -f $(SVM_FEATURES) -s $(SVM_SAMPLES) \
+		classification -c 5 -x $@ -y $(@D)/$(subst X-,y-,$(@F))
 
 data/:
 	mkdir -p data/
@@ -100,4 +163,4 @@ clean:
 	$(MAKE) -C native clean
 	rm -rf data
 
-.PHONY: native python all clean native_data data kmeans_data svm_data
+.PHONY: native python sklearn daal4py all clean native_data data kmeans_data svm_data
@@ -1,6 +1,6 @@
 # scikit-learn_bench
 
-Benchmark for optimizations to scikit-learn in the Intel Distribution for
+Benchmark for optimizations to scikit-learn in the Intel(R) Distribution for
 Python*
 
 ## Prerequisites
@@ -13,76 +13,3 @@ Python*
   - To run only native benchmarks, use `make native`.
   - If you have activated a conda environment, the build will use daal from
     the conda environment, if available.
-
-## Manually
-
-### Build
-
-- `git submodule init && git submodule update` (for native versions only)
-- To build native versions, run `make -C native`.
-- Prepare data for KMeans benchmarks by running
-  `mkdir -p data && python python/kmeans_data.py --size 1000000x50 --fname data/kmeans_1000000x50.csv --clusters 10`
-  - Size can be adjusted. Example sizes are `500000x5`, `500000x25`, `1000000x50`.
-- Prepare data for SVM benchmarks by running
-  `python python/svm_data.py -v 10000 -f 1000`.
-  - Number of vectors can be specified with `-v`, and number of features
-    can be specified with `-f`. Data will by default go in `data/two` and `data/multi`.
-- Prepare data for native benchmarks by running `native/svm_native_data.sh`.
-
-### Run
-- All benchmarks must be given the number of threads to run. If this value
-  is `-1`, then the number of processing threads will equal the number of
-  CPUs available on the system. Otherwise, the benchmark will use the given
-  number of threads.
-- For KMeans benchmarks, an input directory must be specified.
-  The slash must be included at the end of the directory name.
-- For all benchmarks, the `batch`, `hostname`, and `env_name`
-  arguments are only for bookkeeping purposes and can be replaced with
-  placeholders.
-- The KMeans predict benchmark has a multiplier argument. An example value
-  is 100.
-
-#### Python benchmarks
-- Python benchmarks are located in the `python` directory
-  `python python/<benchmark>.py <args...>`
-- The following benchmarks are available:
-  - `distances`: benchmark pairwise distances using `cosine` and `correlation`
-    metrics
-  - `ridge`: benchmark ridge regression fit and prediction
-  - `linear`: benchmark linear regression fit and prediction
-  - `kmeans`: benchmark KMeans fit
-  - `kmeans_predict`: benchmark KMeans predict
-  - `svm_bench`: benchmark two- and multi-class SVM
-- A size must be passed in the form `--size M,N` for all benchmarks except SVM
-- The number of threads to run must be passed in the form `--core-number T`
-  for all benchmarks.
-- For KMeans benchmarks, the input directory must be passed in the form
-  `--input INPUT_DIR`.
-- For SVM benchmarks, the input files must be passed in the form
-  `--fileX FILE_X --fileY FILE_Y`.
-- For the KMeans predict benchmark, the multiplier must be passed in the form
-  `--data-multiplier X`.
-
-#### Native benchmarks
-- Binaries are located in the `native/bin` directory.
-- Sizes must be specified in `MxN` form.
-- The following benchmarks are available:
-  - `cosine <batch> <hostname> <env_name> cosine <threads> double <size>`:
-    benchmark pairwise distances using `cosine` metric
-  - `correlation <batch> <hostname> <env_name> cosine <threads> double <size>`:
-    benchmark pairwise distances using `correlation` metric
-  - `ridge <batch> <hostname> <env_name> cosine <threads> double <size>`:
-    benchmark ridge regression fit and prediction
-  - `linear <batch> <hostname> <env_name> cosine <threads> double <size>`:
-    benchmark linear regression fit and prediction
-  - `kmeans <batch> <hostname> <env_name> cosine <threads> double <size> <input_dir>`:
-    benchmark KMeans fit, finding pregenerated input files in `input_dir`
-  - `kmeans_predict <batch> <hostname> <env_name> cosine <threads> double <size> <input_dir> <multiplier>`
-    benchmark KMeans fit, finding pregenerated input files in `input_dir`.
-    A possible value for `multiplier` is 100.
-  - `{two,multi}_class_svm --fileX <feature-file> --fileY <label-file> --num-threads <threads>`:
-    benchmark two/multi class SVM, using pregenerated feature and label file and using the given
-    number of threads
-
-## See also
-"[Accelerating Scientific Python with Intel Optimizations](http://conference.scipy.org/proceedings/scipy2017/pdfs/oleksandr_pavlyk.pdf)" by Oleksandr Pavlyk, Denis Nagorny, Andres Guzman-Ballen, Anton Malakhov, Hai Liu, Ehsan Totoni, Todd A. Anderson, Sergey Maidanov. Proceedings of the 16th Python in Science Conference (SciPy 2017), July 10 - July 16, Austin, Texas
@@ -0,0 +1,28 @@
+# Copyright (C) 2017-2019 Intel Corporation
+#
+# SPDX-License-Identifier: MIT
+
+
+def getArguments(argParser):
+    argParser.add_argument('--iteration', default=10, type=int,
+                           help='Number of repetitions to run')
+    argParser.add_argument('--num-threads', '--core-number', default=-1,
+                           type=int, help='Number of threads to use')
+    argParser.add_argument('--arch', default='?',
+                           help='Machine architecture, for bookkeeping')
+    argParser.add_argument('--batchID', default='?',
+                           help='Batch ID, for bookkeeping')
+    argParser.add_argument('--prefix', default='?',
+                           help='Prefix string, for bookkeeping')
+    argParser.add_argument('--place',       default='?',       help="prefix string")
+    argParser.add_argument('--cache',       default='?',       help="cached/non-cached")
+    argParser.add_argument('--size', default='?',
+                           help="array size, delimited by comma or 'x'")
+    args = argParser.parse_args()
+
+    args.size = [int(n) for n in args.size.replace('x', ',').split(',')]
+    return args
+
+
+def coreString(num):
+    return 'Serial' if num == 1 else 'Threaded'
@@ -0,0 +1,27 @@
+# Copyright (C) 2018-2019 Intel Corporation
+#
+# SPDX-License-Identifier: MIT
+
+
+def set_daal_num_threads(num_threads):
+    try:
+        import daal4py
+        if num_threads:
+            daal4py.daalinit(nthreads=num_threads)
+    except ImportError:
+        print("@ Package 'daal4py' was not found. Number of threads is being ignored")
+
+
+def prepare_benchmark(args):
+    try:
+        if args.num_threads > 0:
+            set_daal_num_threads(args.num_threads)
+        num_threads = args.num_threads
+        import daal4py
+        daal_version = daal4py.__daal_run_version__
+    except ImportError:
+        num_threads = 1
+        daal_version = None
+
+    return num_threads, daal_version
+
@@ -0,0 +1,52 @@
+# Copyright (C) 2017-2019 Intel Corporation
+#
+# SPDX-License-Identifier: MIT
+
+from __future__ import print_function
+import numpy as np
+import timeit
+from numpy.random import rand
+from daal4py import correlation_distance, cosine_distance, daalinit
+from args import getArguments, coreString
+from bench import prepare_benchmark
+
+import argparse
+argParser = argparse.ArgumentParser(prog="pairwise_distances.py",
+                                    description="sklearn pairwise_distances benchmark",
+                                    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+args = getArguments(argParser)
+REP = args.iteration if args.iteration != '?' else 10
+core_number, daal_version = prepare_benchmark(args)
+
+
+def st_time(func):
+    def st_func(*args, **keyArgs):
+        times = []
+        for n in range(REP):
+            t1 = timeit.default_timer()
+            r = func(*args, **keyArgs)
+            t2 = timeit.default_timer()
+            times.append(t2-t1)
+        print(min(times))
+        return r
+    return st_func
+
+p = args.size[0]
+n = args.size[1]
+
+
+X = rand(p,n)
+
+
+@st_time
+def cosine(X):
+    cos_dist = cosine_distance().compute(X)
+@st_time
+def correlation(X):
+    cor_dist = correlation_distance().compute(X)
+
+print (','.join([args.batchID, args.arch, args.prefix, "Cosine", coreString(args.num_threads), "Double", "%sx%s" % (p,n)]), end=',')
+cosine(X)
+print (','.join([args.batchID, args.arch, args.prefix, "Correlation", coreString(args.num_threads), "Double", "%sx%s" % (p,n)]), end=',')
+correlation(X)