|
24 | 24 | from AnyQt.QtCore import pyqtSignal as Signal |
25 | 25 |
|
26 | 26 | import Orange.data |
| 27 | +import Orange.statistics.util as util |
27 | 28 | from Orange.widgets import widget, gui, settings |
28 | 29 | from Orange.widgets.utils import itemmodels, colorpalette |
29 | 30 | from Orange.widgets.utils.annotated_data import (create_annotated_table, |
@@ -883,74 +884,33 @@ def unique_non_nan(ar): |
883 | 884 | return uniq[~numpy.isnan(uniq)] |
884 | 885 |
|
885 | 886 |
|
886 | | -def varying_matrix(X, idmap): |
887 | | - """ |
888 | | - Enables efficient implementation of `varying_between` |
889 | | - for continuous and discrete variables. |
890 | | -
|
891 | | - Return indices of non-constant matrix columns between groups |
892 | | - defined by indices. Supports sparse data. |
893 | | - """ |
894 | | - # Map rows to groups |
895 | | - inv_idmap = dict() |
896 | | - for g, rows in idmap.items(): |
897 | | - for r in rows: inv_idmap[r] = g |
898 | | - |
899 | | - # Group columns accordingly |
900 | | - vals_per_group = dict() |
901 | | - rows_per_group = dict() |
902 | | - varying = set() |
903 | | - |
904 | | - # Remove columns with duplicate non-zero, non-nan values |
905 | | - for r, c in zip(*X.nonzero()): |
906 | | - g = inv_idmap[r] |
907 | | - rows_per_group[g, c] = \ |
908 | | - rows_per_group.get((g, c), 0) + 1 |
909 | | - val = X[r, c] |
910 | | - if isinstance(val, str) or not numpy.isnan(val): |
911 | | - if (g, c) not in vals_per_group: |
912 | | - vals_per_group[g, c] = set() |
913 | | - vals_per_group[g, c].add(val) |
914 | | - if len(vals_per_group[g, c]) > 1: |
915 | | - varying.add(c) |
916 | | - |
917 | | - # Remove columns with mixed zero and non-zero values |
918 | | - for (g, c), cnt in rows_per_group.items(): |
919 | | - if (g, c) in vals_per_group \ |
920 | | - and len(vals_per_group[g, c]) \ |
921 | | - and (0 < cnt < len(idmap[g])): |
922 | | - varying.add(c) |
923 | | - return varying |
924 | | - |
925 | | - |
926 | 887 | def varying_between(table, idvar): |
927 | 888 | """ |
928 | 889 | Return a list of all variables with non constant values between |
929 | 890 | groups defined by `idvar`. |
930 | 891 |
|
931 | | - I.e. columns where values differ by group: |
932 | | - - If each example is its own group, this operation is trivial. |
933 | | -
|
934 | 892 | """ |
| 893 | + all_possible = [var for var in table.domain.variables + table.domain.metas |
| 894 | + if var != idvar] |
| 895 | + candidate_set = set(all_possible) |
935 | 896 |
|
936 | | - # idvar is not varying, so it would not be removed from the candidate_set |
937 | 897 | idmap = group_table_indices(table, idvar) |
938 | | - varying = set() |
939 | | - |
940 | | - # Trivial case |
941 | | - if len(idmap) == len(table): |
942 | | - return varying |
943 | | - |
944 | | - varying_metas = varying_matrix(table.metas, idmap) |
945 | | - varying_data = varying_matrix(table.X, idmap) |
946 | 898 |
|
947 | | - varying = [var for vi, var in enumerate(table.domain.variables) |
948 | | - if var != idvar and vi in varying_data] + \ |
949 | | - [var for vi, var in enumerate(table.domain.metas) |
950 | | - if var != idvar and vi in varying_metas] |
| 899 | + varying = set() |
| 900 | + for indices in idmap.values(): |
| 901 | + subset = table[indices] |
| 902 | + for var in list(candidate_set): |
| 903 | + column, _ = subset.get_column_view(var) |
| 904 | + values = util.unique(column) |
| 905 | + |
| 906 | + if var.is_string: |
| 907 | + uniq = set(values) |
| 908 | + else: |
| 909 | + uniq = unique_non_nan(values) |
951 | 910 |
|
952 | | - all_possible = [var for var in (table.domain.variables + table.domain.metas) |
953 | | - if var != idvar] |
| 911 | + if len(uniq) > 1: |
| 912 | + varying.add(var) |
| 913 | + candidate_set.remove(var) |
954 | 914 |
|
955 | 915 | return sorted(varying, key=all_possible.index) |
956 | 916 |
|
|
0 commit comments