Skip to content

Commit 23fd764

Browse files
Merge pull request #403 from Crunch-io/rework-subtotals-188552693
[#188552693]: Rework subtotals
2 parents adb1c24 + f4f992e commit 23fd764

File tree

10 files changed

+955
-167
lines changed

10 files changed

+955
-167
lines changed

src/cr/cube/matrix/measure.py

Lines changed: 59 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,7 @@
1313
OverlapSubtotals,
1414
PositiveTermSubtotals,
1515
SumSubtotals,
16+
WaveDiffSubtotal,
1617
)
1718
from cr.cube.smoothing import Smoother
1819
from cr.cube.util import lazyproperty
@@ -799,7 +800,13 @@ def _subtotal_columns(self):
799800
"""
800801
# --- do not propagate divide-by-zero warnings to stderr ---
801802
with np.errstate(divide="ignore", invalid="ignore"):
802-
return self._count_blocks[0][1] / self._weighted_base_blocks[0][1]
803+
default_value = self._count_blocks[0][1] / self._weighted_base_blocks[0][1]
804+
return WaveDiffSubtotal.subtotal_columns(
805+
self._weighted_cube_counts.column_bases,
806+
self._weighted_cube_counts.counts,
807+
default_value,
808+
self._dimensions,
809+
)
803810

804811
@lazyproperty
805812
def _subtotal_rows(self):
@@ -809,7 +816,13 @@ def _subtotal_rows(self):
809816
"""
810817
# --- do not propagate divide-by-zero warnings to stderr ---
811818
with np.errstate(divide="ignore", invalid="ignore"):
812-
return self._count_blocks[1][0] / self._weighted_base_blocks[1][0]
819+
default_value = self._count_blocks[1][0] / self._weighted_base_blocks[1][0]
820+
return WaveDiffSubtotal.subtotal_rows(
821+
self._weighted_cube_counts.column_bases,
822+
self._weighted_cube_counts.counts,
823+
default_value,
824+
self._dimensions,
825+
)
813826

814827
@lazyproperty
815828
def _weighted_base_blocks(self):
@@ -1564,17 +1577,18 @@ def _column_bases(self):
15641577
if self._second_order_measures.columns_squared_base.is_defined:
15651578
weighted_blocks = self._second_order_measures.column_weighted_bases.blocks
15661579
squared_blocks = self._second_order_measures.column_squared_bases.blocks
1567-
effective_blocks = [
1568-
[
1569-
weighted_blocks[0][0] ** 2 / squared_blocks[0][0],
1570-
weighted_blocks[0][1] ** 2 / squared_blocks[0][1],
1571-
],
1572-
[
1573-
weighted_blocks[1][0] ** 2 / squared_blocks[1][0],
1574-
weighted_blocks[1][1] ** 2 / squared_blocks[1][1],
1575-
],
1576-
]
1577-
return effective_blocks
1580+
with np.errstate(divide="ignore", invalid="ignore"):
1581+
effective_blocks = [
1582+
[
1583+
weighted_blocks[0][0] ** 2 / squared_blocks[0][0],
1584+
weighted_blocks[0][1] ** 2 / squared_blocks[0][1],
1585+
],
1586+
[
1587+
weighted_blocks[1][0] ** 2 / squared_blocks[1][0],
1588+
weighted_blocks[1][1] ** 2 / squared_blocks[1][1],
1589+
],
1590+
]
1591+
return effective_blocks
15781592

15791593
unweighted_blocks = self._second_order_measures.column_unweighted_bases.blocks
15801594
return unweighted_blocks
@@ -1845,6 +1859,14 @@ class _RowProportions(_BaseSecondOrderMeasure):
18451859
contributed by the weighted count of each matrix cell.
18461860
"""
18471861

1862+
@lazyproperty
def _count_blocks(self):
    """Blocks of the weighted-counts measure, the numerators of this measure."""
    weighted_counts = self._second_order_measures.weighted_counts
    return weighted_counts.blocks
1865+
1866+
@lazyproperty
def _weighted_base_blocks(self):
    """Blocks of the row-weighted-bases measure, the denominators of this measure."""
    row_weighted_bases = self._second_order_measures.row_weighted_bases
    return row_weighted_bases.blocks
1869+
18481870
@lazyproperty
18491871
def blocks(self):
18501872
"""Nested list of the four 2D ndarray "blocks" making up this measure.
@@ -1854,8 +1876,8 @@ def blocks(self):
18541876
18551877
Row-proportions are row comparable counts divided by the row weighted bases.
18561878
"""
1857-
count_blocks = self._second_order_measures.weighted_counts.blocks
1858-
weighted_base_blocks = self._second_order_measures.row_weighted_bases.blocks
1879+
count_blocks = self._count_blocks
1880+
weighted_base_blocks = self._weighted_base_blocks
18591881

18601882
# --- do not propagate divide-by-zero warnings to stderr ---
18611883
with np.errstate(divide="ignore", invalid="ignore"):
@@ -1864,16 +1886,36 @@ def blocks(self):
18641886
# --- base values ---
18651887
count_blocks[0][0] / weighted_base_blocks[0][0],
18661888
# --- inserted columns ---
1867-
count_blocks[0][1] / weighted_base_blocks[0][1],
1889+
self._inserted_columns,
18681890
],
18691891
[
18701892
# --- inserted rows ---
1871-
count_blocks[1][0] / weighted_base_blocks[1][0],
1893+
self._inserted_rows,
18721894
# --- intersections ---
18731895
count_blocks[1][1] / weighted_base_blocks[1][1],
18741896
],
18751897
]
18761898

1899+
@lazyproperty
def _inserted_rows(self):
    """Values for the inserted (subtotal) rows block of this measure.

    The plain quotient of the inserted-rows count block and weighted-base block is
    computed first and handed to `WaveDiffSubtotal.subtotal_rows()` as the default,
    which decides per subtotal whether to keep it or substitute a wave-difference
    value.
    """
    # --- do not propagate divide-by-zero warnings to stderr; guard here so the ---
    # --- result does not depend on which caller first evaluates this property ---
    with np.errstate(divide="ignore", invalid="ignore"):
        default_value = self._count_blocks[1][0] / self._weighted_base_blocks[1][0]
        return WaveDiffSubtotal.subtotal_rows(
            self._weighted_cube_counts.row_bases,
            self._weighted_cube_counts.counts,
            default_value,
            self._dimensions,
        )
1908+
1909+
@lazyproperty
def _inserted_columns(self):
    """Values for the inserted (subtotal) columns block of this measure.

    The plain quotient of the inserted-columns count block and weighted-base block
    is computed first and handed to `WaveDiffSubtotal.subtotal_columns()` as the
    default, which decides per subtotal whether to keep it or substitute a
    wave-difference value.
    """
    # --- do not propagate divide-by-zero warnings to stderr; guard here so the ---
    # --- result does not depend on which caller first evaluates this property ---
    with np.errstate(divide="ignore", invalid="ignore"):
        default_value = self._count_blocks[0][1] / self._weighted_base_blocks[0][1]
        return WaveDiffSubtotal.subtotal_columns(
            self._weighted_cube_counts.row_bases,
            self._weighted_cube_counts.counts,
            default_value,
            self._dimensions,
        )
1918+
18771919

18781920
class _RowShareSum(_BaseSecondOrderMeasure):
18791921
"""Provides the row share of sum measure for a matrix.

src/cr/cube/matrix/subtotals.py

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
import numpy as np
1414

15+
from cr.cube.enums import DIMENSION_TYPE as DT
1516
from cr.cube.util import lazyproperty
1617

1718

@@ -370,6 +371,136 @@ def _subtotal_row(self, subtotal):
370371
return addend_sum - subtrahend_sum
371372

372373

374+
class WaveDiffSubtotal:
    """Subtotal "blocks" created by adding and subtracting terms for wave differences.

    This class handles a special case for wave differences when a CAT_DATE variable is
    involved in the calculation.

    A wave difference for a CAT_DATE variable is calculated by subtracting at the
    percentages level: (count1/base1) - (count2/base2).

    `base_values` and `counts` are 2D arrays indexed (row, column). `default_insertions`
    holds the already-computed subtotal values, which are returned unchanged for any
    subtotal that is not a difference on a CAT_DATE dimension. `dimensions` is the
    (rows_dimension, columns_dimension) pair.
    """

    def __init__(self, base_values, counts, default_insertions, dimensions):
        self._base_values = base_values
        self._counts = counts
        self._default_insertions = default_insertions
        self._dimensions = dimensions

    @classmethod
    def subtotal_columns(cls, base_values, counts, default_insertions, dimensions):
        """Return (n_rows, n_column_subtotals) ndarray of subtotal columns."""
        return cls(
            base_values, counts, default_insertions, dimensions
        )._subtotal_columns

    @classmethod
    def subtotal_rows(cls, base_values, counts, default_insertions, dimensions):
        """Return (n_row_subtotals, n_cols) ndarray of subtotal rows."""
        return cls(base_values, counts, default_insertions, dimensions)._subtotal_rows

    @lazyproperty
    def _column_subtotals(self):
        """Sequence of _Subtotal object for each subtotal in columns-dimension."""
        return self._dimensions[1].subtotals

    def _multiple_subtrahends_or_addends(self, subtotal):
        """Return True if a difference subtotal has multiple addends or subtrahends.

        A subtotal without any subtrahend is never a "multiple" case.
        """
        # --- test the *number* of subtrahends; `any()` would inspect the index ---
        # --- values themselves and treat a lone subtrahend at index 0 as absent ---
        return len(subtotal.subtrahend_idxs) > 0 and (
            len(subtotal.subtrahend_idxs) > 1 or len(subtotal.addend_idxs) > 1
        )

    def _nan_subtotals(self, axis):
        """Generate an array filled with NaN values.

        Matches the size of the specified axis of the base values.
        """
        return np.full(self._base_values.shape[axis], np.nan)

    @lazyproperty
    def _row_subtotals(self):
        """Sequence of _Subtotal object for each subtotal in rows-dimension."""
        return self._dimensions[0].subtotals

    @lazyproperty
    def _subtotal_rows(self):
        """(n_row_subtotals, n_cols) ndarray of subtotal rows."""
        subtotals = self._row_subtotals
        n_cols = self._base_values.shape[1]
        if len(subtotals) == 0:
            return np.empty((0, n_cols))

        # --- each default row is paired with its subtotal definition ---
        return np.vstack(
            [
                self._subtotal_row(subtotal, default)
                for subtotal, default in zip(subtotals, self._default_insertions)
            ]
        )

    @lazyproperty
    def _subtotal_columns(self):
        """(n_rows, n_col_subtotals) matrix of subtotal columns."""
        subtotals = self._column_subtotals
        n_rows = self._base_values.shape[0]
        if len(subtotals) == 0:
            return np.empty((n_rows, 0))

        # --- transpose the defaults so iteration yields one column per subtotal ---
        return np.hstack(
            [
                self._subtotal_column(subtotal, default).reshape(n_rows, 1)
                for subtotal, default in zip(subtotals, self._default_insertions.T)
            ]
        )

    def _subtotal_column(self, subtotal, default):
        """Return (n_rows,) ndarray of values for `subtotal` column.

        `default` is returned unless the columns dimension is CAT_DATE and `subtotal`
        has at least one subtrahend. A difference with more than one addend or
        subtrahend produces an all-NaN column.
        """
        is_wave_diff = (
            self._dimensions[1].dimension_type == DT.CAT_DATE
            and len(subtotal.subtrahend_idxs) > 0
        )
        if not is_wave_diff:
            return default

        if self._multiple_subtrahends_or_addends(subtotal):
            return self._nan_subtotals(axis=0)

        addend_idxs = subtotal.addend_idxs
        subtrahend_idxs = subtotal.subtrahend_idxs
        base_addend_sum = np.sum(self._base_values[:, addend_idxs], axis=1)
        base_subtrahend_sum = np.sum(self._base_values[:, subtrahend_idxs], axis=1)
        counts_addend_sum = np.sum(self._counts[:, addend_idxs], axis=1)
        counts_subtrahend_sum = np.sum(self._counts[:, subtrahend_idxs], axis=1)

        # --- do not propagate divide-by-zero warnings to stderr ---
        with np.errstate(divide="ignore", invalid="ignore"):
            return (counts_addend_sum / base_addend_sum) - (
                counts_subtrahend_sum / base_subtrahend_sum
            )

    def _subtotal_row(self, subtotal, default):
        """Return (n_cols,) ndarray of values for `subtotal` row.

        `default` is returned unless the rows dimension is CAT_DATE and `subtotal`
        has at least one subtrahend. A difference with more than one addend or
        subtrahend produces an all-NaN row.
        """
        is_wave_diff = (
            self._dimensions[0].dimension_type == DT.CAT_DATE
            and len(subtotal.subtrahend_idxs) > 0
        )
        if not is_wave_diff:
            return default

        if self._multiple_subtrahends_or_addends(subtotal):
            return self._nan_subtotals(axis=1)

        addend_idxs = subtotal.addend_idxs
        subtrahend_idxs = subtotal.subtrahend_idxs
        base_addend_sum = np.sum(self._base_values[addend_idxs, :], axis=0)
        base_subtrahend_sum = np.sum(self._base_values[subtrahend_idxs, :], axis=0)
        counts_addend_sum = np.sum(self._counts[addend_idxs, :], axis=0)
        counts_subtrahend_sum = np.sum(self._counts[subtrahend_idxs, :], axis=0)

        # --- do not propagate divide-by-zero warnings to stderr ---
        with np.errstate(divide="ignore", invalid="ignore"):
            return (counts_addend_sum / base_addend_sum) - (
                counts_subtrahend_sum / base_subtrahend_sum
            )
502+
503+
373504
class OverlapSubtotals(SumSubtotals):
374505
"""Subtotal blocks used exclusively for the "overlap" cube measure.
375506

src/cr/cube/stripe/insertion.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
import numpy as np
1414

15+
from cr.cube.enums import DIMENSION_TYPE as DT
1516
from cr.cube.util import lazyproperty
1617

1718

@@ -104,3 +105,64 @@ def _subtotal_value(self, subtotal):
104105
subtrahend_sum = np.sum(base_values[subtotal.subtrahend_idxs])
105106

106107
return addend_sum - subtrahend_sum
108+
109+
110+
class WaveDiffSubtotals(_BaseSubtotals):
    """Subtotal "blocks" created by adding and subtracting terms for wave differences.

    This class handles a special case for wave differences when a CAT_DATE variable is
    involved in the calculation.

    A wave difference for a CAT_DATE variable is calculated by subtracting at the
    percentages level: (count1/base1) - (count2/base2).

    `default_values` holds the already-computed subtotal values, which are returned
    unchanged whenever the rows dimension is not CAT_DATE or a subtotal is not a
    difference.
    """

    def __init__(self, base_values, counts, default_values, rows_dimension):
        super(WaveDiffSubtotals, self).__init__(base_values, rows_dimension)
        self._counts = counts
        self._default_values = default_values

    @classmethod
    def subtotal_values(cls, base_values, counts, default_values, rows_dimension):
        """Return (n_row_subtotals,) ndarray of subtotal values."""
        return cls(base_values, counts, default_values, rows_dimension)._subtotal_values

    def _multiple_subtrahends_or_addends(self, subtotal):
        """Return True if a difference subtotal has multiple addends or subtrahends.

        A subtotal without any subtrahend is never a "multiple" case.
        """
        # --- test the *number* of subtrahends; `any()` would inspect the index ---
        # --- values themselves and treat a lone subtrahend at index 0 as absent ---
        return len(subtotal.subtrahend_idxs) > 0 and (
            len(subtotal.subtrahend_idxs) > 1 or len(subtotal.addend_idxs) > 1
        )

    @lazyproperty
    def _subtotal_values(self):
        """(n_row_subtotals,) ndarray of subtotal values for stripe."""
        subtotals = self._row_subtotals

        if len(subtotals) == 0:
            return np.array([])

        # --- only a CAT_DATE rows dimension gets wave-diff treatment ---
        if self._rows_dimension.dimension_type != DT.CAT_DATE:
            return self._default_values

        return np.array(
            [
                self._subtotal_value(subtotal, default)
                for subtotal, default in zip(subtotals, self._default_values)
            ]
        )

    def _subtotal_value(self, subtotal, default):
        """Return scalar value of wave diff `subtotal` row.

        `default` is returned unless `subtotal` has both addends and subtrahends. A
        difference with more than one addend or subtrahend produces NaN.
        """
        addend_idxs = subtotal.addend_idxs
        subtrahend_idxs = subtotal.subtrahend_idxs
        if len(subtrahend_idxs) == 0 or len(addend_idxs) == 0:
            return default

        if self._multiple_subtrahends_or_addends(subtotal):
            return np.nan

        base_values = self._base_values
        counts = self._counts
        base_addend_sum = np.sum(base_values[addend_idxs])
        base_subtrahend_sum = np.sum(base_values[subtrahend_idxs])
        counts_addend_sum = np.sum(counts[addend_idxs])
        counts_subtrahend_sum = np.sum(counts[subtrahend_idxs])

        # --- do not propagate divide-by-zero warnings to stderr ---
        with np.errstate(divide="ignore", invalid="ignore"):
            return (counts_addend_sum / base_addend_sum) - (
                counts_subtrahend_sum / base_subtrahend_sum
            )

src/cr/cube/stripe/measure.py

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@
1212
NegativeTermSubtotals,
1313
PositiveTermSubtotals,
1414
SumSubtotals,
15+
WaveDiffSubtotals,
1516
)
1617
from cr.cube.util import lazyproperty
1718

@@ -628,7 +629,13 @@ def subtotal_values(self):
628629

629630
# --- do not propagate divide-by-zero warnings to stderr ---
630631
with np.errstate(divide="ignore", invalid="ignore"):
631-
return subtotal_values / weighted_table_base
632+
default_value = subtotal_values / weighted_table_base
633+
return WaveDiffSubtotals.subtotal_values(
634+
self._weighted_cube_counts.bases,
635+
self._weighted_cube_counts.counts,
636+
default_value,
637+
self._rows_dimension,
638+
)
632639

633640

634641
class _UnweightedBases(_BaseSecondOrderMeasure):

0 commit comments

Comments
 (0)