|
1 | 1 | import numpy as np |
| 2 | +import scipy.sparse as sp |
2 | 3 |
|
3 | 4 | from Orange.classification import Learner, Model |
4 | 5 | from Orange.data import Instance, Storage |
@@ -48,22 +49,62 @@ def __init__(self, log_cont_prob, class_prob, domain): |
48 | 49 |
|
49 | 50 | def predict_storage(self, data): |
50 | 51 | if isinstance(data, Instance): |
51 | | - data = [data] |
52 | | - if len(data.domain.attributes) == 0: |
53 | | - probs = np.tile(self.class_prob, (len(data), 1)) |
| 52 | + data = np.atleast_2d(data.x) |
| 53 | + elif isinstance(data, Storage): |
| 54 | + data = data.X |
| 55 | + |
| 56 | + if not self.log_cont_prob: |
| 57 | + probs = self._priors(data) |
| 58 | + elif sp.issparse(data): |
| 59 | + probs = self._sparse_probs(data) |
54 | 60 | else: |
55 | | - isnan = np.isnan |
56 | | - probs = np.exp( |
57 | | - np.log(self.class_prob) + |
58 | | - np.array([np.zeros_like(self.class_prob) |
59 | | - if isnan(ins.x).all() else |
60 | | - np.sum(attr_prob[:, int(attr_val)] |
61 | | - for attr_val, attr_prob in zip(ins, self.log_cont_prob) |
62 | | - if not isnan(attr_val)) |
63 | | - for ins in data])) |
| 61 | + probs = self._dense_probs(data) |
| 62 | + probs = np.exp(probs) |
64 | 63 | probs /= probs.sum(axis=1)[:, None] |
65 | 64 | values = probs.argmax(axis=1) |
66 | 65 | return values, probs |
67 | 66 |
|
| 67 | + def _priors(self, data): |
| 68 | + return np.tile(np.log(self.class_prob), (data.shape[0], 1)) |
| 69 | + |
| 70 | + def _dense_probs(self, data): |
| 71 | + probs = self._priors(data) |
| 72 | + zeros = np.zeros((1, probs.shape[1])) |
| 73 | + for col, attr_prob in zip(data.T, self.log_cont_prob): |
| 74 | + col = col.copy() |
| 75 | + col[np.isnan(col)] = attr_prob.shape[1] - 1 |
| 76 | + col = col.astype(int) |
| 77 | + probs0 = np.vstack((attr_prob.T, zeros)) |
| 78 | + probs += probs0[col] |
| 79 | + return probs |
| 80 | + |
    def _sparse_probs(self, data):
        """Return joint log-probabilities for a sparse data matrix.

        Stored entries contribute the conditional log-probability of
        their value; entries that are absent from the sparse structure
        implicitly take value 0.  To avoid touching absent entries, the
        log-probability of value 0 is pre-added for every attribute, and
        the lookup table holds probabilities *relative* to value 0.
        """
        probs = self._priors(data)

        # One extra bucket (n_vals - 1) stays all-zero and is used below
        # as the target for NaN entries.
        n_vals = max(p.shape[1] for p in self.log_cont_prob) + 1
        # log_prob[i, v, c]: log P(value v | class c) - log P(0 | class c)
        # for attribute i; rows past an attribute's value count are zero.
        log_prob = np.zeros((len(self.log_cont_prob),
                             n_vals,
                             self.log_cont_prob[0].shape[0]))
        for i, p in enumerate(self.log_cont_prob):
            p0 = p.T[0].copy()
            # Pre-add value-0 log-probability for every row; stored
            # entries later add only the difference from value 0.
            probs[:] += p0
            log_prob[i, :p.shape[1]] = p.T - p0

        dat = data.data.copy()
        # NaNs index the all-zero bucket, so an explicitly stored NaN
        # contributes the same as an absent (value-0) entry.
        dat[np.isnan(dat)] = n_vals - 1
        dat = dat.astype(int)

        if sp.isspmatrix_csr(data):
            # CSR: walk each row's stored segment; indices give the
            # attribute, dat the (clamped) value.
            for row, start, end in zip(probs, data.indptr, data.indptr[1:]):
                row += log_prob[data.indices[start:end],
                                dat[start:end]].sum(axis=0)
        else:
            # Otherwise convert to CSC and walk attribute by attribute,
            # scattering each column's contributions into its rows.
            csc = data.tocsc()
            for start, end, attr_prob in zip(csc.indptr, csc.indptr[1:],
                                             log_prob):
                probs[csc.indices[start:end]] += attr_prob[dat[start:end]]

        return probs
| 108 | + |
68 | 109 |
|
69 | 110 | NaiveBayesLearner.__returns__ = NaiveBayesModel |
0 commit comments