Skip to content

Commit e87e812

Browse files
committed
FIX #213, name creation via hashing for sparse data
1 parent 6068f54 commit e87e812

File tree

3 files changed

+36
-15
lines changed

3 files changed

+36
-15
lines changed

autosklearn/automl.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
pipeline
2727
from autosklearn.ensemble_builder import EnsembleBuilder
2828
from autosklearn.smbo import AutoMLSMBO
29-
from autosklearn.util.hash import hash_numpy_array
29+
from autosklearn.util.hash import hash_array_or_matrix
3030

3131

3232
def _model_predict(self, X, batch_size, identifier):
@@ -158,7 +158,7 @@ def fit(self, X, y,
158158
self._backend.context.create_directories()
159159

160160
if dataset_name is None:
161-
dataset_name = hash_numpy_array(X)
161+
dataset_name = hash_array_or_matrix(X)
162162

163163
self._backend.save_start_time(self._seed)
164164
self._stopwatch = StopWatch()

autosklearn/util/hash.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,23 @@
11
import hashlib
22

3+
import scipy.sparse
34

4-
def hash_numpy_array(X):
5+
6+
def hash_array_or_matrix(X):
    """Return an MD5 hex digest identifying a dense array or sparse matrix.

    The digest is built from the raw data buffer(s) of ``X`` followed by
    the string form of its shape.

    Parameters
    ----------
    X : numpy.ndarray or scipy.sparse matrix
        Dense array (any memory layout) or sparse matrix (any format).

    Returns
    -------
    str
        Hexadecimal MD5 digest.

    Notes
    -----
    * Sparse formats without ``indices``/``indptr`` (e.g. COO, DOK, LIL,
      DIA) are converted to CSR first; compressed formats are hashed
      as they are.
    * An F-contiguous array is hashed through its transpose, so it gets
      the same digest as the C-contiguous array with the same memory
      content and transposed shape.
    * An array that is neither C- nor F-contiguous (e.g. a strided view)
      is copied into C order first, so it gets the same digest as its
      C-contiguous copy.
    """
    m = hashlib.md5()

    if scipy.sparse.issparse(X):
        # Only compressed formats expose ``indices`` and ``indptr``;
        # everything else is normalised to CSR to avoid AttributeError.
        if not (hasattr(X, 'indices') and hasattr(X, 'indptr')):
            X = X.tocsr()
        m.update(X.indices)
        m.update(X.indptr)
        m.update(X.data)
        m.update(str(X.shape).encode('utf8'))
    elif X.flags['C_CONTIGUOUS']:
        m.update(X.data)
        m.update(str(X.shape).encode('utf8'))
    elif X.flags['F_CONTIGUOUS']:
        # The transpose of an F-contiguous array is a C-contiguous view,
        # which hashlib can read without copying.
        transposed = X.T
        m.update(transposed.data)
        m.update(str(transposed.shape).encode('utf8'))
    else:
        # Neither layout: hashlib rejects non-contiguous buffers, so
        # materialise a C-ordered copy before hashing.
        contiguous = X.copy(order='C')
        m.update(contiguous.data)
        m.update(str(contiguous.shape).encode('utf8'))

    return m.hexdigest()

test/test_util/test_hash.py

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,25 @@
11
import unittest
22

33
import numpy as np
4+
import scipy.sparse
45

5-
from autosklearn.util.hash import hash_numpy_array
6+
from autosklearn.util.hash import hash_array_or_matrix
67

78

89
class HashTests(unittest.TestCase):
910

1011
def test_c_contiguous_array(self):
1112
array = np.array([[1, 2], [3, 4]])
1213

13-
hash = hash_numpy_array(array)
14+
hash = hash_array_or_matrix(array)
1415

1516
self.assertIsNotNone(hash)
1617

1718
def test_f_contiguous_array(self):
1819
array = np.array([[1, 2], [3, 4]])
1920
array = np.asfortranarray(array)
2021

21-
hash = hash_numpy_array(array)
22+
hash = hash_array_or_matrix(array)
2223

2324
self.assertIsNotNone(hash)
2425

@@ -27,25 +28,35 @@ def test_transpose_arrays(self):
2728
f_array = np.array([[1, 3], [2, 4]])
2829
f_array = np.asfortranarray(f_array)
2930

30-
c_hash = hash_numpy_array(c_array)
31-
f_hash = hash_numpy_array(f_array)
31+
c_hash = hash_array_or_matrix(c_array)
32+
f_hash = hash_array_or_matrix(f_array)
3233

3334
self.assertEqual(c_hash, f_hash)
3435

3536
def test_same_data_arrays(self):
3637
first_array = np.array([[1, 2], [3, 4]])
3738
second_array = np.array([[1, 2], [3, 4]])
3839

39-
first_hash = hash_numpy_array(first_array)
40-
second_hash = hash_numpy_array(second_array)
40+
first_hash = hash_array_or_matrix(first_array)
41+
second_hash = hash_array_or_matrix(second_array)
4142

4243
self.assertEqual(first_hash, second_hash)
4344

4445
def test_different_data_arrays(self):
4546
first_array = np.array([[1, 2], [3, 4]])
4647
second_array = np.array([[1, 3], [2, 4]])
4748

48-
first_hash = hash_numpy_array(first_array)
49-
second_hash = hash_numpy_array(second_array)
49+
first_hash = hash_array_or_matrix(first_array)
50+
second_hash = hash_array_or_matrix(second_array)
5051

51-
self.assertNotEqual(first_hash, second_hash)
52+
self.assertNotEqual(first_hash, second_hash)
53+
54+
def test_scipy_csr(self):
55+
row = np.array([0, 0, 1, 2, 2, 2])
56+
col = np.array([0, 2, 2, 0, 1, 2])
57+
data = np.array([1, 2, 3, 4, 5, 6])
58+
matrix = scipy.sparse.csr_matrix((data, (row, col)), shape=(3, 3))
59+
60+
hash = hash_array_or_matrix(matrix)
61+
62+
self.assertIsNotNone(hash)

0 commit comments

Comments
 (0)