Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Orange/classification/rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -562,8 +562,8 @@ def find_new_selectors(self, X, Y, W, domain, existing_selectors):

# Diff hunk for the static method `discretize`. Indentation and the +/-
# markers were lost when this page was captured, so the removed and the added
# version of the same statement appear one after the other below.
@staticmethod
def discretize(X, Y, W, domain):
# Pre-change call (presumably the 2 deletions reported in the hunk header):
# the class column is cast to intp and the result is unpacked as three values.
values, counts, _ = _contingency.contingency_floatarray(
X, Y.astype(dtype=np.intp), len(domain.class_var.values), W)
# Post-change call (presumably the 2 additions): the class column is cast to
# float64 and the result is unpacked as ((values, counts), _, _, _); only
# `values` and `counts` are used afterwards.
(values, counts), _, _, _ = _contingency.contingency_floatarray(
X, Y.astype(np.float64), len(domain.class_var.values), W)
# The transposed counts are handed to EntropyMDL._entropy_discretize_sorted;
# the indices it returns are used to pick the cut points out of `values`.
cut_ind = np.array(EntropyMDL._entropy_discretize_sorted(counts.T, True))
return [values[smh] for smh in cut_ind]

Expand Down
2,651 changes: 1,585 additions & 1,066 deletions Orange/data/_contingency.c

Large diffs are not rendered by default.

31 changes: 20 additions & 11 deletions Orange/data/_contingency.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#cython: embedsignature=True
#cython: language_level=3

import numpy
cimport numpy as np
Expand All @@ -8,13 +9,15 @@ cdef extern from "numpy/npy_math.h":
bint npy_isnan(double x)

@cython.wraparound(False)
def contingency_floatarray(np.ndarray[np.float64_t, ndim=1] col_data, np.ndarray[np.intp_t, ndim=1] classes, np.intp_t n_rows, np.ndarray[np.float64_t, ndim=1] W = None):
"""
def contingency_floatarray(np.ndarray[np.float64_t, ndim=1] col_data, np.ndarray[np.float64_t, ndim=1] classes, np.intp_t n_rows, np.ndarray[np.float64_t, ndim=1] W = None):
"""
Given column values and class values, return
- an array with the sorted list of values,
- a 2D array with counts for the value (indexed by columns)
and class value (indexed by rows),
- and an array with the number of missing values for each class.
- an array with the number of missing column values for each class,
- an array with the number of missing class values for each column value,
- and the number of rows in which both the class and the column value are missing.
"""
cdef np.ndarray[np.intp_t, ndim=1] ranks = col_data.argsort()
cdef int N = 0
Expand All @@ -32,22 +35,28 @@ def contingency_floatarray(np.ndarray[np.float64_t, ndim=1] col_data, np.ndarray
cdef np.ndarray[np.float64_t, ndim=2] C = numpy.zeros((n_rows, N), dtype=numpy.float64)
last = float("NaN")
j = -1
cdef Py_ssize_t tc
cdef np.ndarray[np.float64_t, ndim=1] unknown = numpy.zeros(n_rows, dtype=numpy.float64)
cdef np.float64_t tc
cdef np.ndarray[np.float64_t, ndim=1] col_unknowns = numpy.zeros(n_rows, dtype=numpy.float64)
cdef np.ndarray[np.float64_t, ndim=1] row_unknowns = numpy.zeros(N, dtype=numpy.float64)
cdef np.float64_t unknowns = 0
for i in range(ranks.shape[0]):
i = ranks[i]
v = col_data[i]
tc = classes[i]
if v != last and not npy_isnan(v):
if npy_isnan(v) and npy_isnan(tc):
unknowns += W[i] if weights else 1.
elif npy_isnan(tc):
row_unknowns[j] += W[i] if weights else 1.
elif npy_isnan(v):
col_unknowns[int(tc)] += W[i] if weights else 1.
elif v != last:
j += 1
V[j] = v
last = v
C[tc,j] += W[i] if weights else 1.
elif npy_isnan(v):
unknown[tc] += W[i] if weights else 1.
C[int(tc),j] += W[i] if weights else 1.
else:
C[tc,j] += W[i] if weights else 1.
C[int(tc),j] += W[i] if weights else 1.

assert j == N-1

return V,C,unknown
return (V,C),col_unknowns,row_unknowns,unknowns
7 changes: 4 additions & 3 deletions Orange/data/sql/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,11 +445,12 @@ def _compute_contingency(self, col_vars=None, row_var=None):
data = list(cur.fetchall())
if column.is_continuous:
all_contingencies[i] = \
(self._continuous_contingencies(data, row), [])
(self._continuous_contingencies(data, row), [], [], 0)
else:
all_contingencies[i] =\
(self._discrete_contingencies(data, row, column), [])
return all_contingencies, None
(self._discrete_contingencies(data, row, column), [],
[], 0)
return all_contingencies

def _continuous_contingencies(self, data, row):
values = np.zeros(len(data))
Expand Down
38 changes: 12 additions & 26 deletions Orange/data/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
import numpy as np
from scipy import sparse as sp

from Orange.util import OrangeDeprecationWarning
import Orange.data # import for io.py
from Orange.data import (
_contingency, _valuecount,
Expand Down Expand Up @@ -1429,7 +1428,6 @@ def _compute_contingency(self, col_vars=None, row_var=None):
row_data = self._Y[:, row_indi - n_atts]

W = self.W if self.has_weights() else None
nan_inds = None

col_desc = [self.domain[var] for var in col_vars]
col_indi = [self.domain.index(var) for var in col_vars]
Expand All @@ -1442,23 +1440,12 @@ def _compute_contingency(self, col_vars=None, row_var=None):
if row_data.dtype.kind != "f": #meta attributes can be stored as type object
row_data = row_data.astype(float)

unknown_rows = countnans(row_data)
if unknown_rows:
nan_inds = np.isnan(row_data)
row_data = row_data[~nan_inds]
if W:
W = W[~nan_inds]
unknown_rows = np.sum(W[nan_inds])

contingencies = [None] * len(col_desc)
for arr, f_cond, f_ind in (
(self.X, lambda i: 0 <= i < n_atts, lambda i: i),
(self._Y, lambda i: i >= n_atts, lambda i: i - n_atts),
(self.metas, lambda i: i < 0, lambda i: -1 - i)):

if nan_inds is not None:
arr = arr[~nan_inds]

arr_indi = [e for e, ind in enumerate(col_indi) if f_cond(ind)]

vars = [(e, f_ind(col_indi[e]), col_desc[e]) for e in arr_indi]
Expand All @@ -1468,12 +1455,13 @@ def _compute_contingency(self, col_vars=None, row_var=None):
max_vals = max(len(v[2].values) for v in disc_vars)
disc_indi = {i for _, i, _ in disc_vars}
mask = [i in disc_indi for i in range(arr.shape[1])]
conts, nans = contingency(arr, row_data, max_vals - 1,
n_rows - 1, W, mask)
conts, nans_cols, nans_rows, nans = contingency(
arr, row_data, max_vals - 1, n_rows - 1, W, mask)
for col_i, arr_i, var in disc_vars:
n_vals = len(var.values)
contingencies[col_i] = (conts[arr_i][:, :n_vals],
nans[arr_i])
contingencies[col_i] = (
conts[arr_i][:, :n_vals], nans_cols[arr_i],
nans_rows[arr_i], nans[arr_i])
else:
for col_i, arr_i, var in disc_vars:
contingencies[col_i] = contingency(
Expand All @@ -1482,28 +1470,26 @@ def _compute_contingency(self, col_vars=None, row_var=None):

cont_vars = [v for v in vars if v[2].is_continuous]
if cont_vars:

classes = row_data.astype(dtype=np.intp)
W_ = None
if W is not None:
W = W.astype(dtype=np.float64)
W_ = W.astype(dtype=np.float64)
if sp.issparse(arr):
arr = sp.csc_matrix(arr)

for col_i, arr_i, _ in cont_vars:
if sp.issparse(arr):
col_data = arr.data[arr.indptr[arr_i]:arr.indptr[arr_i + 1]]
rows = arr.indices[arr.indptr[arr_i]:arr.indptr[arr_i + 1]]
W_ = None if W is None else W[rows]
classes_ = classes[rows]
W_ = None if W_ is None else W_[rows]
classes_ = row_data[rows]
else:
col_data, W_, classes_ = arr[:, arr_i], W, classes
col_data, W_, classes_ = arr[:, arr_i], W_, row_data

col_data = col_data.astype(dtype=np.float64)
U, C, unknown = _contingency.contingency_floatarray(
contingencies[col_i] = _contingency.contingency_floatarray(
col_data, classes_, n_rows, W_)
contingencies[col_i] = ([U, C], unknown)

return contingencies, unknown_rows
return contingencies

@classmethod
def transpose(cls, table, feature_names_column="",
Expand Down
Loading