Skip to content

Commit 331f4cc

Browse files
committed
Update contingency to show missing values for rows by column value
1 parent a6f06ee commit 331f4cc

File tree

9 files changed

+1837
-1208
lines changed

9 files changed

+1837
-1208
lines changed

Orange/classification/rules.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -562,8 +562,8 @@ def find_new_selectors(self, X, Y, W, domain, existing_selectors):
562562

563563
@staticmethod
564564
def discretize(X, Y, W, domain):
565-
values, counts, _ = _contingency.contingency_floatarray(
566-
X, Y.astype(dtype=np.intp), len(domain.class_var.values), W)
565+
(values, counts), _, _, _ = _contingency.contingency_floatarray(
566+
X, Y.astype(np.float64), len(domain.class_var.values), W)
567567
cut_ind = np.array(EntropyMDL._entropy_discretize_sorted(counts.T, True))
568568
return [values[smh] for smh in cut_ind]
569569

Orange/data/_contingency.c

Lines changed: 1585 additions & 1066 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Orange/data/_contingency.pyx

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#cython: embedsignature=True
2+
#cython: language_level=3
23

34
import numpy
45
cimport numpy as np
@@ -8,13 +9,15 @@ cdef extern from "numpy/npy_math.h":
89
bint npy_isnan(double x)
910

1011
@cython.wraparound(False)
11-
def contingency_floatarray(np.ndarray[np.float64_t, ndim=1] col_data, np.ndarray[np.intp_t, ndim=1] classes, np.intp_t n_rows, np.ndarray[np.float64_t, ndim=1] W = None):
12-
"""
12+
def contingency_floatarray(np.ndarray[np.float64_t, ndim=1] col_data, np.ndarray[np.float64_t, ndim=1] classes, np.intp_t n_rows, np.ndarray[np.float64_t, ndim=1] W = None):
13+
"""
1314
Given column values and class values, return
1415
- an array with the sorted list of values,
1516
- a 2D array with counts for the value (indexed by columns)
1617
and class value (indexed by rows),
17-
- and an array with the number of missing values for each class.
18+
- an array with the number of missing column values for each class.
19+
- an array with the number of missing class values for each column value.
20+
- and the number of rows in which both the class and the column value are missing.
1821
"""
1922
cdef np.ndarray[np.intp_t, ndim=1] ranks = col_data.argsort()
2023
cdef int N = 0
@@ -32,22 +35,28 @@ def contingency_floatarray(np.ndarray[np.float64_t, ndim=1] col_data, np.ndarray
3235
cdef np.ndarray[np.float64_t, ndim=2] C = numpy.zeros((n_rows, N), dtype=numpy.float64)
3336
last = float("NaN")
3437
j = -1
35-
cdef Py_ssize_t tc
36-
cdef np.ndarray[np.float64_t, ndim=1] unknown = numpy.zeros(n_rows, dtype=numpy.float64)
38+
cdef np.float64_t tc
39+
cdef np.ndarray[np.float64_t, ndim=1] col_unknowns = numpy.zeros(n_rows, dtype=numpy.float64)
40+
cdef np.ndarray[np.float64_t, ndim=1] row_unknowns = numpy.zeros(N, dtype=numpy.float64)
41+
cdef np.float64_t unknowns = 0
3742
for i in range(ranks.shape[0]):
3843
i = ranks[i]
3944
v = col_data[i]
4045
tc = classes[i]
41-
if v != last and not npy_isnan(v):
46+
if npy_isnan(v) and npy_isnan(tc):
47+
unknowns += W[i] if weights else 1.
48+
elif npy_isnan(tc):
49+
row_unknowns[j] += W[i] if weights else 1.
50+
elif npy_isnan(v):
51+
col_unknowns[int(tc)] += W[i] if weights else 1.
52+
elif v != last:
4253
j += 1
4354
V[j] = v
4455
last = v
45-
C[tc,j] += W[i] if weights else 1.
46-
elif npy_isnan(v):
47-
unknown[tc] += W[i] if weights else 1.
56+
C[int(tc),j] += W[i] if weights else 1.
4857
else:
49-
C[tc,j] += W[i] if weights else 1.
58+
C[int(tc),j] += W[i] if weights else 1.
5059

5160
assert j == N-1
5261

53-
return V,C,unknown
62+
return (V,C),col_unknowns,row_unknowns,unknowns

Orange/data/sql/table.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -445,11 +445,12 @@ def _compute_contingency(self, col_vars=None, row_var=None):
445445
data = list(cur.fetchall())
446446
if column.is_continuous:
447447
all_contingencies[i] = \
448-
(self._continuous_contingencies(data, row), [])
448+
(self._continuous_contingencies(data, row), [], [], 0)
449449
else:
450450
all_contingencies[i] =\
451-
(self._discrete_contingencies(data, row, column), [])
452-
return all_contingencies, None
451+
(self._discrete_contingencies(data, row, column), [],
452+
[], 0)
453+
return all_contingencies
453454

454455
def _continuous_contingencies(self, data, row):
455456
values = np.zeros(len(data))

Orange/data/table.py

Lines changed: 12 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
import numpy as np
1313
from scipy import sparse as sp
1414

15-
from Orange.util import OrangeDeprecationWarning
1615
import Orange.data # import for io.py
1716
from Orange.data import (
1817
_contingency, _valuecount,
@@ -1429,7 +1428,6 @@ def _compute_contingency(self, col_vars=None, row_var=None):
14291428
row_data = self._Y[:, row_indi - n_atts]
14301429

14311430
W = self.W if self.has_weights() else None
1432-
nan_inds = None
14331431

14341432
col_desc = [self.domain[var] for var in col_vars]
14351433
col_indi = [self.domain.index(var) for var in col_vars]
@@ -1442,23 +1440,12 @@ def _compute_contingency(self, col_vars=None, row_var=None):
14421440
if row_data.dtype.kind != "f": #meta attributes can be stored as type object
14431441
row_data = row_data.astype(float)
14441442

1445-
unknown_rows = countnans(row_data)
1446-
if unknown_rows:
1447-
nan_inds = np.isnan(row_data)
1448-
row_data = row_data[~nan_inds]
1449-
if W:
1450-
W = W[~nan_inds]
1451-
unknown_rows = np.sum(W[nan_inds])
1452-
14531443
contingencies = [None] * len(col_desc)
14541444
for arr, f_cond, f_ind in (
14551445
(self.X, lambda i: 0 <= i < n_atts, lambda i: i),
14561446
(self._Y, lambda i: i >= n_atts, lambda i: i - n_atts),
14571447
(self.metas, lambda i: i < 0, lambda i: -1 - i)):
14581448

1459-
if nan_inds is not None:
1460-
arr = arr[~nan_inds]
1461-
14621449
arr_indi = [e for e, ind in enumerate(col_indi) if f_cond(ind)]
14631450

14641451
vars = [(e, f_ind(col_indi[e]), col_desc[e]) for e in arr_indi]
@@ -1468,12 +1455,13 @@ def _compute_contingency(self, col_vars=None, row_var=None):
14681455
max_vals = max(len(v[2].values) for v in disc_vars)
14691456
disc_indi = {i for _, i, _ in disc_vars}
14701457
mask = [i in disc_indi for i in range(arr.shape[1])]
1471-
conts, nans = contingency(arr, row_data, max_vals - 1,
1472-
n_rows - 1, W, mask)
1458+
conts, nans_cols, nans_rows, nans = contingency(
1459+
arr, row_data, max_vals - 1, n_rows - 1, W, mask)
14731460
for col_i, arr_i, var in disc_vars:
14741461
n_vals = len(var.values)
1475-
contingencies[col_i] = (conts[arr_i][:, :n_vals],
1476-
nans[arr_i])
1462+
contingencies[col_i] = (
1463+
conts[arr_i][:, :n_vals], nans_cols[arr_i],
1464+
nans_rows[arr_i], nans[arr_i])
14771465
else:
14781466
for col_i, arr_i, var in disc_vars:
14791467
contingencies[col_i] = contingency(
@@ -1482,28 +1470,26 @@ def _compute_contingency(self, col_vars=None, row_var=None):
14821470

14831471
cont_vars = [v for v in vars if v[2].is_continuous]
14841472
if cont_vars:
1485-
1486-
classes = row_data.astype(dtype=np.intp)
1473+
W_ = None
14871474
if W is not None:
1488-
W = W.astype(dtype=np.float64)
1475+
W_ = W.astype(dtype=np.float64)
14891476
if sp.issparse(arr):
14901477
arr = sp.csc_matrix(arr)
14911478

14921479
for col_i, arr_i, _ in cont_vars:
14931480
if sp.issparse(arr):
14941481
col_data = arr.data[arr.indptr[arr_i]:arr.indptr[arr_i + 1]]
14951482
rows = arr.indices[arr.indptr[arr_i]:arr.indptr[arr_i + 1]]
1496-
W_ = None if W is None else W[rows]
1497-
classes_ = classes[rows]
1483+
W_ = None if W_ is None else W_[rows]
1484+
classes_ = row_data[rows]
14981485
else:
1499-
col_data, W_, classes_ = arr[:, arr_i], W, classes
1486+
col_data, W_, classes_ = arr[:, arr_i], W_, row_data
15001487

15011488
col_data = col_data.astype(dtype=np.float64)
1502-
U, C, unknown = _contingency.contingency_floatarray(
1489+
contingencies[col_i] = _contingency.contingency_floatarray(
15031490
col_data, classes_, n_rows, W_)
1504-
contingencies[col_i] = ([U, C], unknown)
15051491

1506-
return contingencies, unknown_rows
1492+
return contingencies
15071493

15081494
@classmethod
15091495
def transpose(cls, table, feature_names_column="",

0 commit comments

Comments
 (0)