@@ -1,7 +1,8 @@
 import numpy as np
+import scipy.sparse as sp
 
 from Orange.classification import Learner, Model
-from Orange.data import Instance, Storage
+from Orange.data import Instance, Storage, Table
 from Orange.statistics import contingency
 from Orange.preprocess import Discretize, RemoveNaNColumns
 
@@ -47,23 +48,81 @@ def __init__(self, log_cont_prob, class_prob, domain): |
         self.class_prob = class_prob
 
     def predict_storage(self, data):
+        if type(data) is Table:  # pylint: disable=unidiomatic-typecheck
+            return self.predict(data.X)
+
         if isinstance(data, Instance):
-            data = [data]
+            data = Table(data.domain, [data])
         if len(data.domain.attributes) == 0:
             probs = np.tile(self.class_prob, (len(data), 1))
         else:
             isnan = np.isnan
-            probs = np.exp(
+            probs = np.atleast_2d(np.exp(
                 np.log(self.class_prob) +
                 np.array([np.zeros_like(self.class_prob)
                           if isnan(ins.x).all() else
-                          np.sum(attr_prob[:, int(attr_val)]
-                                 for attr_val, attr_prob in zip(ins, self.log_cont_prob)
-                                 if not isnan(attr_val))
-                          for ins in data]))
+                          np.sum([attr_prob[:, int(attr_val)]
+                                  for attr_val, attr_prob in
+                                  zip(ins, self.log_cont_prob)
+                                  if not isnan(attr_val)], axis=0)
+                          for ins in data])))
         probs /= probs.sum(axis=1)[:, None]
         values = probs.argmax(axis=1)
         return values, probs
 
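The fast path above routes a plain `Table` straight to the vectorized `predict` added below, wraps a single `Instance` into a one-row `Table`, and keeps the per-instance loop only for other `Storage` backends (such as SQL tables). A minimal sketch of that per-instance scoring, with made-up probabilities for a two-class, one-attribute model; the table layout (rows are classes, columns are attribute values) follows `attr_prob[:, int(attr_val)]` above:

```python
import numpy as np

class_prob = np.array([0.6, 0.4])                 # per-class prior
log_cont_prob = [np.log(np.array([[0.7, 0.3],     # rows: classes,
                                  [0.2, 0.8]]))]  # columns: attribute values

x = [1.0]                                         # one instance, one attribute
log_post = np.log(class_prob) + np.sum(
    [p[:, int(v)] for v, p in zip(x, log_cont_prob) if not np.isnan(v)],
    axis=0)
probs = np.exp(log_post)
probs /= probs.sum()                              # normalized posterior P(c | x)
```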
+    def predict(self, X):
+        if not self.log_cont_prob:
+            probs = self._priors(X)
+        elif sp.issparse(X):
+            probs = self._sparse_probs(X)
+        else:
+            probs = self._dense_probs(X)
+        probs = np.exp(probs)
+        probs /= probs.sum(axis=1)[:, None]
+        values = probs.argmax(axis=1)
+        return values, probs
+
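`predict` only chooses how the per-class log-scores are accumulated: priors alone when the model has no attributes, a sparse or a dense routine otherwise. Exponentiation and row normalization are shared. A self-contained sketch of that final step, with illustrative numbers:

```python
import numpy as np

log_probs = np.array([[-1.0, -2.0],    # unnormalized per-class log-scores,
                      [-0.5, -0.5]])   # one row per sample
probs = np.exp(log_probs)
probs /= probs.sum(axis=1)[:, None]    # each row now sums to 1
values = probs.argmax(axis=1)          # predicted class indices
```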
+    def _priors(self, data):
+        return np.tile(np.log(self.class_prob), (data.shape[0], 1))
+
+    def _dense_probs(self, data):
+        probs = self._priors(data)
+        zeros = np.zeros((1, probs.shape[1]))
+        for col, attr_prob in zip(data.T, self.log_cont_prob):
+            col = col.copy()
+            col[np.isnan(col)] = attr_prob.shape[1]
+            col = col.astype(int)
+            probs0 = np.vstack((attr_prob.T, zeros))
+            probs += probs0[col]
+        return probs
+
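`_dense_probs` turns each attribute into a table lookup: `attr_prob.T` has one row per attribute value, and an all-zeros row is stacked underneath so that missing values, remapped to that extra index, add nothing to the log-score. A sketch of the indexing with made-up numbers (one attribute, two values, two classes):

```python
import numpy as np

attr_prob = np.log(np.array([[0.7, 0.3],    # rows: classes,
                             [0.2, 0.8]]))  # columns: attribute values
probs0 = np.vstack((attr_prob.T, np.zeros((1, 2))))  # rows: value 0, value 1, NaN

col = np.array([0.0, 1.0, np.nan])          # one column of X, three samples
col[np.isnan(col)] = attr_prob.shape[1]     # NaN -> index of the zeros row
col = col.astype(int)
print(probs0[col])                          # last row all zeros: NaN is ignored
```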
+    def _sparse_probs(self, data):
+        probs = self._priors(data)
+
+        n_vals = max(p.shape[1] for p in self.log_cont_prob) + 1
+        log_prob = np.zeros((len(self.log_cont_prob),
+                             n_vals,
+                             self.log_cont_prob[0].shape[0]))
+        for i, p in enumerate(self.log_cont_prob):
+            p0 = p.T[0].copy()
+            probs[:] += p0
+            log_prob[i, :p.shape[1]] = p.T - p0
+
+        dat = data.data.copy()
+        dat[np.isnan(dat)] = n_vals - 1
+        dat = dat.astype(int)
+
+        if sp.isspmatrix_csr(data):
+            for row, start, end in zip(probs, data.indptr, data.indptr[1:]):
+                row += log_prob[data.indices[start:end],
+                                dat[start:end]].sum(axis=0)
+        else:
+            csc = data.tocsc()
+            for start, end, attr_prob in zip(csc.indptr, csc.indptr[1:],
+                                             log_prob):
+                probs[csc.indices[start:end]] += attr_prob[dat[start:end]]
+
+        return probs
+
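`_sparse_probs` exploits the sparse representation: every attribute first credits all rows with the log-probability of value 0 (`p0`), and only the corrections `p.T - p0` are stored, so just the explicitly stored entries need a lookup and the work is proportional to the number of nonzeros. A sketch with made-up numbers, assuming two attributes that share one probability table:

```python
import numpy as np
import scipy.sparse as sp

p = np.log(np.array([[0.7, 0.3],        # rows: classes, columns: values
                     [0.2, 0.8]]))
p0 = p.T[0]                             # per-class log-prob of value 0
correction = p.T - p0                   # zero row for value 0, delta otherwise

X = sp.csr_matrix(np.array([[0., 1.],   # two samples, two attributes
                            [1., 0.]]))
probs = np.zeros((2, 2)) + 2 * p0       # baseline: both attributes at value 0
for row, start, end in zip(probs, X.indptr, X.indptr[1:]):
    vals = X.data[start:end].astype(int)
    row += correction[vals].sum(axis=0)  # correct only the stored entries
print(probs)                             # row 0 equals p.T[0] + p.T[1] per class
```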
 
 NaiveBayesLearner.__returns__ = NaiveBayesModel
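For context, a minimal usage sketch of the new fast path; the dataset name is illustrative (`titanic` ships with Orange and has only discrete attributes):

```python
import Orange

data = Orange.data.Table("titanic")
model = Orange.classification.NaiveBayesLearner()(data)
values, probs = model.predict(data.X)   # vectorized path on the raw matrix
print(values[:5], probs[:5])
```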