Skip to content

Commit a60ebf2

Browse files
committed
rebuild and retest
1 parent be741b2 commit a60ebf2

File tree

16 files changed

+223
-66
lines changed

16 files changed

+223
-66
lines changed

build/lib/data_algebra/__init__.py

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,26 +19,37 @@
1919

2020
import data_algebra.data_model
2121
import data_algebra.data_ops
22+
2223
# import for easy access for package users
2324
from data_algebra.data_ops import (
24-
describe_table,
25-
descr,
26-
data,
27-
ex,
25+
describe_table,
26+
descr,
27+
data,
28+
ex,
2829
table
2930
)
3031
import data_algebra.view_representations
3132
from data_algebra.view_representations import (
3233
TableDescription,
3334
ViewRepresentation,
34-
SQLNode
35+
SQLNode,
3536
)
3637
import data_algebra.expr_rep
37-
from data_algebra.expr_rep import lit, col, d_, one
38+
from data_algebra.expr_rep import (
39+
lit, col, d_, one
40+
)
3841

3942
import data_algebra.data_schema
43+
import data_algebra.cdata
44+
from data_algebra.cdata import (
45+
RecordMap,
46+
RecordSpecification,
47+
pivot_blocks_to_rowrecs,
48+
pivot_rowrecs_to_blocks,
49+
pivot_specification,
50+
unpivot_specification
51+
)
4052

4153
import data_algebra.pandas_model
42-
4354
# do this last so everything is loaded/defined
4455
data_algebra.pandas_model.register_pandas_model()

build/lib/data_algebra/cdata.py

Lines changed: 108 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def _format_table(
2323
record_id_cols: Iterable[str],
2424
control_id_cols: Iterable[str],
2525
add_style: bool = True,
26-
):
26+
) -> str:
2727
local_data_model = data_algebra.data_model.lookup_data_model_for_dataframe(d)
2828
d = local_data_model.to_pandas(d)
2929
pd = data_algebra.data_model.lookup_data_model_for_dataframe(d).pd
@@ -52,12 +52,13 @@ def _format_table(
5252
d = d.style.set_properties(
5353
**{"background-color": "#FFE4C4"}, subset=record_id_col_pairs
5454
).set_properties(**{"background-color": "#7FFFD4"}, subset=control_id_col_pairs)
55-
return d
55+
return d._repr_html_()
5656

5757

5858
class RecordSpecification:
5959
"""
60-
Class to represent a multi-row data record.
60+
Class to represent a data record.
61+
For single row data records use None as the specification.
6162
"""
6263

6364
row_columns: List[str] # columns when in row form
@@ -93,11 +94,22 @@ def __init__(
9394
control_table = local_data_model.clean_copy(control_table)
9495
if control_table.shape[0] < 1:
9596
raise ValueError("control table should have at least 1 row")
97+
if control_table.shape[1] < 2:
98+
raise ValueError("control table must have at least 2 columns")
9699
if len(control_table.columns) != len(set(control_table.columns)):
97100
raise ValueError("control table columns should be unique")
101+
if control_table_keys is None:
102+
if control_table.shape[0] > 1:
103+
control_table_keys = [control_table.columns[0]]
104+
else:
105+
control_table_keys = [] # single row records don't need to be keyed
106+
if isinstance(control_table_keys, str):
107+
control_table_keys = [control_table_keys]
108+
else:
109+
control_table_keys = list(control_table_keys)
110+
assert isinstance(control_table_keys, List)
98111
if strict:
99112
if control_table.shape[0] > 1:
100-
assert control_table_keys is not None
101113
assert len(control_table_keys) > 0
102114
assert local_data_model.table_is_keyed_by_columns(
103115
control_table, column_names=control_table_keys
@@ -106,14 +118,12 @@ def __init__(
106118
assert self.control_table.shape[0] > 0
107119
if record_keys is None:
108120
record_keys = []
109-
if isinstance(record_keys, str):
121+
elif isinstance(record_keys, str):
110122
record_keys = [record_keys]
111-
self.record_keys = list(record_keys)
112-
if control_table_keys is None:
113-
if self.control_table.shape[0] > 1:
114-
control_table_keys = [self.control_table.columns[0]]
115-
else:
116-
control_table_keys = [] # single row records don't need to be keyed
123+
else:
124+
record_keys = list(record_keys)
125+
assert isinstance(record_keys, list)
126+
self.record_keys = record_keys
117127
if isinstance(control_table_keys, str):
118128
control_table_keys = [control_table_keys]
119129
if self.control_table.shape[0] > 1:
@@ -182,12 +192,55 @@ def row_version(self, *, include_record_keys: bool = True) -> List[str]:
182192
:param include_record_keys: logical, if True include record keys as columns
183193
:return: column list
184194
"""
195+
assert isinstance(include_record_keys, bool)
185196
cols: List[str] = []
186197
if include_record_keys:
187198
cols = cols + self.record_keys
188199
cols = cols + self.content_keys
189200
return cols
201+
202+
def row_record_form(self):
    """
    Return specification of matching row record form.
    Note: prefer using None to specify row records specs.

    The returned specification has a one-row control table in which each
    non-record-key column of the row form holds its own name, keeps this
    specification's record keys, and uses no control table keys.

    :return: RecordSpecification for the single-row layout
    """
    local_data_model = data_algebra.data_model.lookup_data_model_for_dataframe(
        self.control_table
    )
    # One control row: every value column is listed once, holding its own name.
    # NOTE(review): the RecordSpecification constructor appears to reject control
    # tables with fewer than 2 columns, so a specification with a single value
    # column presumably cannot be converted here -- confirm.
    ct = local_data_model.data_frame(
        {k: [k] for k in self.row_version(include_record_keys=False)}
    )
    return RecordSpecification(
        ct,
        record_keys=self.record_keys,
        control_table_keys=[],
        strict=self.strict,
        local_data_model=local_data_model,
    )
220+
221+
def value_column_form(self, *, key_column_name: str = "measure", value_column_name: str = "value"):
    """
    Return specification of the matching value column form.
    Note: for type safety prefer map_to_rows() to map_to_keyed_column().

    The control table of the result has one row per non-record-key column of
    the row form; the key column and the value column both carry that
    column's name.

    :param key_column_name: name for additional keying column
    :param value_column_name: name for value column
    :return: RecordSpecification keyed by key_column_name
    """
    assert isinstance(key_column_name, str)
    assert isinstance(value_column_name, str)
    model = data_algebra.data_model.lookup_data_model_for_dataframe(self.control_table)
    key_entries = self.row_version(include_record_keys=False)
    value_entries = self.row_version(include_record_keys=False)
    control = model.data_frame(
        {
            key_column_name: key_entries,
            value_column_name: value_entries,
        }
    )
    spec = RecordSpecification(
        control,
        record_keys=self.record_keys,
        control_table_keys=[key_column_name],
        strict=self.strict,
        local_data_model=model,
    )
    return spec
243+
191244
def __repr__(self):
192245
s = (
193246
"data_algebra.cdata.RecordSpecification(\n"
@@ -248,7 +301,12 @@ def _repr_html_(self):
248301
+ _str_list_to_html(self.control_table_keys)
249302
+ "</li>\n"
250303
+ "<li>control_table:<br>\n"
251-
+ self.control_table._repr_html_()
304+
+ _format_table(
305+
self.control_table,
306+
record_id_cols=self.record_keys,
307+
control_id_cols=self.control_table_keys,
308+
add_style=True,
309+
)
252310
+ "</li>\n"
253311
+ "</ul>"
254312
+ "</p>\n"
@@ -266,7 +324,8 @@ def map_to_rows(self):
266324
267325
:return: RecordMap
268326
"""
269-
327+
if self.control_table.shape[0] <= 1:
328+
raise ValueError("already in row record format")
270329
return RecordMap(blocks_in=self, strict=self.strict)
271330

272331
def map_from_rows(self):
@@ -275,8 +334,33 @@ def map_from_rows(self):
275334
276335
:return: RecordMap
277336
"""
278-
337+
if self.control_table.shape[0] <= 1:
338+
raise ValueError("already in row record format")
279339
return RecordMap(blocks_out=self, strict=self.strict)
340+
341+
def map_to_keyed_column(self, *, key_column_name: str = "measure", value_column_name: str = "value"):
    """
    Build a RecordMap mapping this RecordSpecification to a table
    where only one column holds values.
    Note: for type safety prefer map_to_rows() to map_to_keyed_column().

    :param key_column_name: name for additional keying column
    :param value_column_name: name for value column
    :return: Record map
    """
    # Forward the caller's column names; calling value_column_form() with no
    # arguments would silently fall back to "measure"/"value".
    keyed_form = self.value_column_form(
        key_column_name=key_column_name,
        value_column_name=value_column_name,
    )
    return RecordMap(blocks_in=self, blocks_out=keyed_form, strict=self.strict)
353+
354+
def map_from_keyed_column(self, *, key_column_name: str = "measure", value_column_name: str = "value"):
    """
    Build a RecordMap mapping this RecordSpecification from a table
    where only one column holds values.

    :param key_column_name: name for additional keying column
    :param value_column_name: name for value column
    :return: Record map
    """
    # Forward the caller's column names; calling value_column_form() with no
    # arguments would silently fall back to "measure"/"value".
    keyed_form = self.value_column_form(
        key_column_name=key_column_name,
        value_column_name=value_column_name,
    )
    return RecordMap(blocks_in=keyed_form, blocks_out=self, strict=self.strict)
280364

281365

282366
class RecordMap(ShiftPipeAction):
@@ -309,16 +393,23 @@ def __init__(
309393
assert isinstance(blocks_in, RecordSpecification)
310394
if strict:
311395
assert blocks_in.strict
396+
if blocks_in.control_table.shape[0] <= 1:
397+
blocks_in = None
398+
if blocks_in is not None:
312399
ck = [k for k in blocks_in.content_keys if k is not None]
313400
if len(ck) != len(set(ck)):
314401
raise ValueError("blocks_in can not have duplicate content keys")
402+
if blocks_in.control_table.shape[0] <= 1:
403+
raise ValueError("for row records use None specification")
315404
if blocks_out is not None:
316405
assert isinstance(blocks_out, RecordSpecification)
317406
if strict:
318407
assert blocks_out.strict
408+
if blocks_out.control_table.shape[0] <= 1:
409+
blocks_out = None
319410
if (blocks_in is None) and (blocks_out is None):
320411
raise ValueError(
321-
"At least one of blocks_in or blocks_out should not be None"
412+
"At least one of blocks_in or blocks_out should not be None or a non-row record"
322413
)
323414
if (blocks_in is not None) and (blocks_out is not None):
324415
unknown = set(blocks_out.record_keys) - set(blocks_in.record_keys)
@@ -658,9 +749,9 @@ def _repr_html_(self):
658749
)
659750
s = (
660751
"RecordMap: transforming records of the form:<br>\n"
661-
+ example_input_formatted._repr_html_()
752+
+ example_input_formatted
662753
+ "<br>\nto records of the form:<br>\n"
663-
+ example_output_formatted._repr_html_()
754+
+ example_output_formatted
664755
)
665756
return s
666757

build/lib/data_algebra/data_schema.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
have_polars = False
1313
try:
1414
import polars as pl # conditional import
15+
1516
have_polars = True
1617
except ModuleNotFoundError:
1718
pass
@@ -75,7 +76,9 @@ def non_null_types_in_frame(d) -> Dict[str, Optional[Set[Type]]]:
7576
# check that this is a data frame, so we can raise a legible error
7677
# instead of raising a non-existent attribute error
7778
if not is_data_frame(d):
78-
raise TypeError(f"expected a Pandas or Polars data frame, had {_type_name(type(d))}")
79+
raise TypeError(
80+
f"expected a Pandas or Polars data frame, had {_type_name(type(d))}"
81+
)
7982
result = dict()
8083
for col_name in d.columns:
8184
types_seen = {type(vi) for vi in d[col_name] if not _is_null(vi)}
@@ -139,7 +142,9 @@ class SchemaRaises(SchemaBase):
139142
Raises TypeError on schema violations.
140143
"""
141144

142-
def _check_data_frame_matches_schema(self, *, d, expected_type: Optional[Dict]) -> Optional[str]:
145+
def _check_data_frame_matches_schema(
146+
self, *, d, expected_type: Optional[Dict]
147+
) -> Optional[str]:
143148
assert isinstance(expected_type, (type(None), Dict))
144149
# check that this is a data frame, so we can raise a legible error
145150
# instead of raising a non-existent attribute error
@@ -190,7 +195,9 @@ def _check_spec(
190195
)
191196
return f"expected type one of {type_names}, found type {_type_name(observed_type)}"
192197
elif isinstance(expected_type, dict):
193-
schema_issue = self._check_data_frame_matches_schema(d=observed_value, expected_type=expected_type)
198+
schema_issue = self._check_data_frame_matches_schema(
199+
d=observed_value, expected_type=expected_type
200+
)
194201
if schema_issue is not None:
195202
return schema_issue
196203
else:

build/lib/data_algebra/expr_parse_fn.py

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,35 @@
1-
2-
31
"""
42
Parse expression to data ops in a medium environment. Uses eval() (don't run on untrusted expressions).
3+
Used for confirming repr() is reversible.
54
"""
65

76

87
import data_algebra
98
import data_algebra.data_model
109
import numpy as np # for globals() in eval_da_ops()
11-
from data_algebra.data_ops import describe_table, table, descr, data # for globals() in eval_da_ops()
12-
from data_algebra.view_representations import ViewRepresentation, TableDescription # for globals() in eval_da_ops()
13-
pd = data_algebra.data_model.default_data_model().pd # for globals() in eval_da_ops()
10+
from data_algebra.data_ops import (
11+
describe_table,
12+
table,
13+
descr,
14+
data,
15+
) # for globals() in eval_da_ops()
16+
from data_algebra.view_representations import (
17+
ViewRepresentation,
18+
TableDescription,
19+
) # for globals() in eval_da_ops()
1420

15-
g_env = globals()
21+
pd = data_algebra.data_model.default_data_model().pd # for globals() in eval_da_ops()
22+
g_env = {k: v for k, v in globals().items()}
1623

1724
from typing import Any, Dict, Optional
1825

1926

20-
def eval_da_ops(ops_str: str, *, data_model_map: Optional[Dict[str, Any]]) -> ViewRepresentation:
27+
def eval_da_ops(
28+
ops_str: str, *, data_model_map: Optional[Dict[str, Any]]
29+
) -> ViewRepresentation:
2130
"""
2231
Parse ops_str into a ViewRepresentation. Uses eval() (don't run on untrusted expressions).
32+
Used for confirming repr() is reversible.
2333
2434
:param ops_str: text representation of a data algebra pipeline or expression.
2535
:param data_model_map: tables
@@ -39,4 +49,4 @@ def eval_da_ops(ops_str: str, *, data_model_map: Optional[Dict[str, Any]]) -> Vi
3949
# cdata uses this
4050
)
4151
assert isinstance(ops, ViewRepresentation)
42-
return ops
52+
return ops

build/lib/data_algebra/flow_text.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
def flow_text(tokens, *, align_right=70, sep_width=1):
77
"""
8+
Flow text around a margin for presentation.
89
910
:param tokens: list or tuple of strings
1011
:param align_right: integer, right alignment margin

build/lib/data_algebra/fmt_python.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
21
_have_black = False
32
try:
43
# noinspection PyUnresolvedReferences

build/lib/data_algebra/test_util.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,9 @@
4242
global_test_result_cache: Optional[data_algebra.eval_cache.ResultCache] = None
4343

4444

45-
def _re_parse(ops: ViewRepresentation, *, data_model_map: Dict[str, Any]) -> ViewRepresentation:
45+
def _re_parse(
46+
ops: ViewRepresentation, *, data_model_map: Dict[str, Any]
47+
) -> ViewRepresentation:
4648
"""
4749
Return copy of object made by dumping to string via repr() and then evaluating that string.
4850
"""

0 commit comments

Comments
 (0)