Skip to content

Commit da2a26d

Browse files
committed
rebuild and retest
1 parent fd95406 commit da2a26d

File tree

15 files changed

+4511
-4214
lines changed

15 files changed

+4511
-4214
lines changed

build/lib/data_algebra/data_model.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,18 @@ def table_is_keyed_by_columns(self, table, *, column_names: Iterable[str]) -> bo
7272
:return: True if rows are uniquely keyed by values in named columns
7373
"""
7474

75+
@abc.abstractmethod
76+
def concat_rows(self, frame_list: List):
77+
"""
78+
Concatenate rows from frame_list
79+
"""
80+
81+
@abc.abstractmethod
82+
def concat_columns(self, frame_list):
83+
"""
84+
Concatenate columns from frame_list
85+
"""
86+
7587
# evaluate
7688

7789
@abc.abstractmethod

build/lib/data_algebra/pandas_base.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -456,14 +456,24 @@ def bad_column_positions(self, x):
456456
self.pd.isnull(x), numpy.logical_or(numpy.isnan(x), numpy.isinf(x))
457457
)
458458
return self.pd.isnull(x)
459+
460+
def concat_rows(self, frame_list: List):
461+
"""
462+
Concatenate rows from frame_list
463+
"""
464+
frame_list = list(frame_list)
465+
assert len(frame_list) > 0
466+
if len(frame_list) == 1:
467+
return self.clean_copy(frame_list[0])
468+
res = self.pd.concat(frame_list, axis=0)
469+
return res
459470

460-
def concat_columns(self, frame_list):
471+
def concat_columns(self, frame_list: List):
461472
"""
462-
Concatinate columns from frame_list
473+
Concatenate columns from frame_list
463474
"""
464475
frame_list = list(frame_list)
465-
if len(frame_list) <= 0:
466-
return None
476+
assert len(frame_list) > 0
467477
if len(frame_list) == 1:
468478
return self.clean_copy(frame_list[0])
469479
res = self.pd.concat(frame_list, axis=1)

build/lib/data_algebra/polars_model.py

Lines changed: 58 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -229,7 +229,7 @@ def _populate_expr_impl_map() -> Dict[int, Dict[str, Callable]]:
229229
"as_int64": lambda x: x.cast(int),
230230
"as_str": lambda x: x.cast(str),
231231
"base_Sunday": lambda x: x.base_Sunday(),
232-
"bfill": lambda x: x.bfill(),
232+
"bfill": lambda x: x.fill_null(strategy='backward'),
233233
"ceil": lambda x: x.ceil(),
234234
"coalesce0": lambda x: pl.when(x.is_null()).then(pl.col(_da_temp_zero_column_name)).otherwise(x),
235235
"cos": lambda x: x.cos(),
@@ -246,7 +246,7 @@ def _populate_expr_impl_map() -> Dict[int, Dict[str, Callable]]:
246246
"dayofyear": lambda x: x.dayofyear(),
247247
"exp": lambda x: x.exp(),
248248
"expm1": lambda x: x.expm1(),
249-
"ffill": lambda x: x.ffill(),
249+
"ffill": lambda x: x.fill_null(strategy='forward'),
250250
"first": lambda x: x.first(),
251251
"floor": lambda x: x.floor(),
252252
"format_date": lambda x: x.format_date(),
@@ -289,7 +289,6 @@ def _populate_expr_impl_map() -> Dict[int, Dict[str, Callable]]:
289289
"%/%": lambda a, b: a / b,
290290
"around": lambda a, b: a.round(b),
291291
"coalesce": lambda a, b: pl.when(a.is_null()).then(b).otherwise(a),
292-
"concat": lambda a, b: a.concat(b),
293292
"date_diff": lambda a, b: a.date_diff(b),
294293
"is_in": lambda a, b: a.is_in(b),
295294
"mod": lambda a, b: a % b,
@@ -310,10 +309,10 @@ def _populate_expr_impl_map() -> Dict[int, Dict[str, Callable]]:
310309
"parse_datetime": lambda x, format : x.cast(str).str.strptime(pl.Datetime, fmt=format, strict=False).cast(pl.Datetime),
311310
}
312311
impl_map_3 = {
313-
"if_else": lambda a, b, c: pl.when(a).then(b).otherwise(c),
312+
"if_else": lambda a, b, c: pl.when(a.is_null()).then(pl.lit(None)).otherwise(pl.when(a).then(b).otherwise(c)),
314313
"mapv": _mapv,
315314
"trimstr": lambda a, b, c: a.trimstr(b, c),
316-
"where": lambda a, b, c: pl.when(a).then(b).otherwise(c),
315+
"where": lambda a, b, c: pl.when(a.is_null()).then(c).otherwise(pl.when(a).then(b).otherwise(c)),
317316
}
318317
impl_map = {
319318
0: impl_map_0,
@@ -367,6 +366,7 @@ def __init__(self, *, use_lazy_eval: bool = True):
367366
}
368367
self._expr_impl_map = _populate_expr_impl_map()
369368
self._impl_map_arbitrary_arity = {
369+
"concat": lambda *args: pl.concat_str(args),
370370
"fmax": lambda *args: pl.max(args),
371371
"fmin": lambda *args: pl.min(args),
372372
"maximum": lambda *args: pl.max(args),
@@ -436,13 +436,23 @@ def drop_indices(self, df) -> None:
436436

437437
def bad_column_positions(self, x):
438438
"""
439-
Return vector indicating which entries are bad (null or nan) (vectorized).
439+
Return vector indicating which entries are null (vectorized).
440440
"""
441441
return x.is_null()
442442

443+
def concat_rows(self, frame_list: List):
444+
"""
445+
Concatenate rows from frame_list
446+
"""
447+
frame_list = list(frame_list)
448+
assert len(frame_list) > 0
449+
if len(frame_list) == 1:
450+
return frame_list[0]
451+
pl.concat(frame_list, how="vertical")
452+
443453
def concat_columns(self, frame_list):
444454
"""
445-
Concatinate columns from frame_list
455+
Concatenate columns from frame_list
446456
"""
447457
frame_list = list(frame_list)
448458
if len(frame_list) <= 0:
@@ -677,23 +687,46 @@ def _natural_join_step(self, op: data_algebra.data_ops_types.OperatorPlatform, *
677687
)
678688
inputs = [self._compose_polars_ops(s, data_map=data_map) for s in op.sources]
679689
assert len(inputs) == 2
680-
res = inputs[0].join(
681-
inputs[1],
682-
left_on=op.on_a,
683-
right_on=op.on_b,
684-
how=op.jointype.lower(),
685-
suffix = "_da_right_tmp",
686-
)
687-
coalesce_columns = set(op.sources[0].columns_produced()).intersection(op.sources[1].columns_produced()) - set(op.on_a)
688-
if len(coalesce_columns) > 0:
689-
res = res.with_columns([
690-
pl.when(pl.col(c).is_null())
691-
.then(pl.col(c + "_da_right_tmp"))
692-
.otherwise(pl.col(c))
693-
.alias(c)
694-
for c in coalesce_columns
695-
])
696-
res = res.select(op.columns_produced())
690+
how = op.jointype.lower()
691+
if how == "full":
692+
how = "outer"
693+
coalesce_columns = (
694+
set(op.sources[0].columns_produced()).intersection(op.sources[1].columns_produced())
695+
- set(op.on_a))
696+
if how != "right":
697+
res = inputs[0].join(
698+
inputs[1],
699+
left_on=op.on_a,
700+
right_on=op.on_b,
701+
how=how,
702+
suffix = "_da_right_tmp",
703+
)
704+
if len(coalesce_columns) > 0:
705+
res = res.with_columns([
706+
pl.when(pl.col(c).is_null())
707+
.then(pl.col(c + "_da_right_tmp"))
708+
.otherwise(pl.col(c))
709+
.alias(c)
710+
for c in coalesce_columns
711+
])
712+
else:
713+
# simulate right join with left join
714+
res = inputs[1].join(
715+
inputs[0],
716+
left_on=op.on_b,
717+
right_on=op.on_a,
718+
how="left",
719+
suffix = "_da_left_tmp",
720+
)
721+
if len(coalesce_columns) > 0:
722+
res = res.with_columns([
723+
pl.when(pl.col(c + "_da_left_tmp").is_null())
724+
.then(pl.col(c))
725+
.otherwise(pl.col(c + "_da_left_tmp"))
726+
.alias(c)
727+
for c in coalesce_columns
728+
])
729+
res = res.select(op.columns_produced())
697730
return res
698731

699732
def _order_rows_step(self, op: data_algebra.data_ops_types.OperatorPlatform, *, data_map: Dict[str, Any]):
@@ -1016,6 +1049,7 @@ def act_on_expression(self, *, arg, values: List, op):
10161049
if (f is None):
10171050
if op.op in ["_ngroup", "ngroup"]:
10181051
assert isinstance(arg, pl.DataFrame)
1052+
# n_groups = arg.groupby(["x"]).apply(lambda x: x.head(1)).shape[0]
10191053
raise ValueError(f" {op.op} not implemented for Polars adapter, yet")
10201054
if f is None:
10211055
try:

build/lib/data_algebra/solutions.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,7 @@ def xicor_score_variables_plan(
224224
assert isinstance(n_rep, int)
225225
record_map = RecordMap(
226226
blocks_out=RecordSpecification(
227-
control_table=data_algebra.data_model.default_data_model().pd.DataFrame(
227+
control_table=data_algebra.data_model.default_data_model().data_frame(
228228
{
229229
"variable_name": x_vars,
230230
"x": x_vars,
@@ -237,7 +237,7 @@ def xicor_score_variables_plan(
237237
),
238238
strict=False,
239239
)
240-
rep_frame = data_algebra.data_model.default_data_model().pd.DataFrame({"rep": range(n_rep)})
240+
rep_frame = data_algebra.data_model.default_data_model().data_frame({"rep": range(n_rep)})
241241
grouped_calc = (
242242
xicor_query(
243243
d
@@ -529,13 +529,12 @@ def replicate_rows_query(
529529
assert power_key_colname not in d.column_names
530530
# get a pandas namespace
531531
local_data_model = data_algebra.data_model.default_data_model()
532-
pd = local_data_model.pd
533532
# build powers of 2 until max_count is met or exceeded
534533
powers = list(range(int(numpy.ceil(numpy.log(max_count) / numpy.log(2))) + 1))
535534
# replicate each power the number of times it specifies
536-
count_frame = pd.concat(
535+
count_frame = local_data_model.concat_rows(
537536
[
538-
pd.DataFrame(
537+
local_data_model.data_frame(
539538
{
540539
power_key_colname: f"p{p}",
541540
seq_column_name: range(int(2**p)),

build/lib/data_algebra/test_util.py

Lines changed: 43 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -553,6 +553,7 @@ def check_transform(
553553
cols_case_sensitive: bool = False,
554554
check_row_order: bool = False,
555555
check_parse: bool = True,
556+
try_on_DBs: bool = True,
556557
models_to_skip: Optional[Iterable] = None,
557558
valid_for_empty: bool = True,
558559
empty_produces_empty: bool = True,
@@ -571,6 +572,7 @@ def check_transform(
571572
:param cols_case_sensitive: passed to equivalent_frames()
572573
:param check_row_order: passed to equivalent_frames()
573574
:param check_parse: if True check expression parses/formats to self
575+
:param try_on_DBs: if true, try on databases
574576
:param models_to_skip: None or set of model names or models to skip testing
575577
:param valid_for_empty: logical, if True perform tests on empty inputs
576578
:param empty_produces_empty: logical, if True assume empty inputs should produce empty output
@@ -583,6 +585,7 @@ def check_transform(
583585
cols_used = ops.columns_used()
584586
table_name = [k for k in cols_used.keys()][0]
585587
data = {table_name: data}
588+
assert isinstance(try_on_DBs, bool)
586589
assert isinstance(try_on_Polars, bool)
587590
assert expect is not None
588591
if local_data_model is None:
@@ -619,44 +622,44 @@ def check_transform(
619622
empty_produces_empty=empty_produces_empty,
620623
local_data_model=polars_data_model,
621624
)
622-
623-
caught: Optional[Any] = None
624-
db_handles = [
625-
# non-connected handles, lets us test some of the SQL generation path
626-
data_algebra.SQLite.SQLiteModel().db_handle(None),
627-
data_algebra.BigQuery.BigQueryModel().db_handle(None),
628-
data_algebra.PostgreSQL.PostgreSQLModel().db_handle(None),
629-
data_algebra.SparkSQL.SparkSQLModel().db_handle(None),
630-
data_algebra.MySQL.MySQLModel().db_handle(None),
631-
]
632-
try:
633-
test_dbs = get_test_dbs()
634-
db_handles = db_handles + test_dbs
635-
if models_to_skip is not None:
636-
models_to_skip = {str(m) for m in models_to_skip}
637-
db_handles = [h for h in db_handles if str(h.db_model) not in models_to_skip]
638-
_check_transform_on_handles(
639-
ops=ops,
640-
data=data,
641-
expect=expect,
642-
float_tol=float_tol,
643-
check_column_order=check_column_order,
644-
cols_case_sensitive=cols_case_sensitive,
645-
check_row_order=check_row_order,
646-
db_handles=db_handles,
647-
local_data_model=local_data_model,
648-
)
649-
except AssertionError as ase:
650-
traceback.print_exc()
651-
caught = ase
652-
except Exception as exc:
653-
traceback.print_exc()
654-
caught = exc
655-
for handle in db_handles:
656-
# noinspection PyBroadException
625+
if try_on_DBs:
626+
caught: Optional[Any] = None
627+
db_handles = [
628+
# non-connected handles, lets us test some of the SQL generation path
629+
data_algebra.SQLite.SQLiteModel().db_handle(None),
630+
data_algebra.BigQuery.BigQueryModel().db_handle(None),
631+
data_algebra.PostgreSQL.PostgreSQLModel().db_handle(None),
632+
data_algebra.SparkSQL.SparkSQLModel().db_handle(None),
633+
data_algebra.MySQL.MySQLModel().db_handle(None),
634+
]
657635
try:
658-
handle.close()
659-
except Exception:
660-
pass
661-
if caught is not None:
662-
raise caught
636+
test_dbs = get_test_dbs()
637+
db_handles = db_handles + test_dbs
638+
if models_to_skip is not None:
639+
models_to_skip = {str(m) for m in models_to_skip}
640+
db_handles = [h for h in db_handles if str(h.db_model) not in models_to_skip]
641+
_check_transform_on_handles(
642+
ops=ops,
643+
data=data,
644+
expect=expect,
645+
float_tol=float_tol,
646+
check_column_order=check_column_order,
647+
cols_case_sensitive=cols_case_sensitive,
648+
check_row_order=check_row_order,
649+
db_handles=db_handles,
650+
local_data_model=local_data_model,
651+
)
652+
except AssertionError as ase:
653+
traceback.print_exc()
654+
caught = ase
655+
except Exception as exc:
656+
traceback.print_exc()
657+
caught = exc
658+
for handle in db_handles:
659+
# noinspection PyBroadException
660+
try:
661+
handle.close()
662+
except Exception:
663+
pass
664+
if caught is not None:
665+
raise caught

coverage.txt

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ data_algebra/__init__.py 9 0 100%
144144
data_algebra/arrow.py 141 41 71% 45, 52, 56-57, 66, 69, 73, 92, 96, 102, 113-116, 121, 129, 136, 155, 158, 171-172, 203, 212, 221-234, 237-245, 258, 260, 262, 266, 270
145145
data_algebra/cdata.py 287 42 85% 48, 50, 54, 56, 64, 67, 73, 77, 80, 85, 89, 97, 105, 110, 154, 231, 237, 243, 246, 265, 267, 269, 272, 275, 282-284, 299, 310, 325, 357, 364-367, 378, 414, 449, 472, 477, 482, 487, 491
146146
data_algebra/connected_components.py 22 0 100%
147-
data_algebra/data_model.py 60 0 100%
147+
data_algebra/data_model.py 64 0 100%
148148
data_algebra/data_model_space.py 51 7 86% 23, 39-40, 45, 55-56, 62
149149
data_algebra/data_ops.py 1336 201 85% 35-36, 55-56, 93, 129, 223, 279, 338, 375, 377, 379, 381, 385, 447, 492, 523, 550, 584, 590, 592, 677, 679, 728, 751, 757, 772, 774, 785, 797, 825, 827, 840, 842, 848, 860, 863, 875, 878, 891, 893, 895, 897, 908, 910, 946, 962, 982, 984, 986, 988, 992, 1027-1035, 1038-1048, 1064, 1088, 1126-1129, 1134, 1138, 1262, 1267, 1272, 1274, 1281, 1283, 1291, 1297, 1299, 1301, 1304, 1307, 1310, 1315, 1332, 1344, 1359, 1402, 1404, 1406, 1408, 1410, 1412, 1452, 1462, 1477, 1552, 1554, 1564, 1570, 1573, 1591, 1601, 1661, 1663, 1665, 1668, 1679, 1752, 1754, 1790-1794, 1798, 1802, 1814, 1866, 1870, 1873, 1875, 1906-1910, 1914, 1916, 1928, 1978, 1983, 1988, 2019-2023, 2027, 2029, 2040, 2091, 2096, 2102, 2105, 2121-2125, 2131, 2133, 2135, 2137, 2149, 2219, 2267-2271, 2275, 2277, 2279, 2290, 2405-2409, 2413, 2415, 2426, 2504, 2519, 2549, 2569, 2571, 2573, 2575, 2586, 2659, 2665, 2667, 2687-2691, 2700, 2702, 2704, 2706, 2717, 2787, 2803-2807, 2811, 2813, 2859, 2937-2945, 2949, 2951, 2953, 2955, 2961
150150
data_algebra/data_ops_types.py 82 10 88% 325, 339-340, 344-348, 353, 361
@@ -160,17 +160,17 @@ data_algebra/flow_text.py 17 0 100%
160160
data_algebra/near_sql.py 237 3 99% 41, 256-257
161161
data_algebra/op_catalog.py 3 0 100%
162162
data_algebra/op_container.py 127 49 61% 46-47, 63-71, 80-81, 88-89, 92-93, 96, 99, 103-104, 109, 114, 142-143, 146-152, 164-177, 180-183, 186-187, 198-199, 206-207, 214-215, 218-219, 222-226, 232-233, 237, 240, 243, 246
163-
data_algebra/pandas_base.py 718 67 91% 54, 68, 77, 87, 92, 101, 223, 225, 239, 242, 247, 252, 466, 468, 482, 487, 492, 504, 510-517, 524, 558-563, 593, 597, 600, 602, 639, 693, 734, 751, 771, 789, 799, 814, 862, 870, 878, 893, 904, 916, 935, 950, 978, 993, 1030, 1047, 1050, 1061, 1080, 1087, 1116, 1138, 1162, 1167, 1173, 1183, 1282, 1296-1298
163+
data_algebra/pandas_base.py 724 67 91% 54, 68, 77, 87, 92, 101, 223, 225, 239, 242, 247, 252, 467, 478, 492, 497, 502, 514, 520-527, 534, 568-573, 603, 607, 610, 612, 649, 703, 744, 761, 781, 799, 809, 824, 872, 880, 888, 903, 914, 926, 945, 960, 988, 1003, 1040, 1057, 1060, 1071, 1090, 1097, 1126, 1148, 1172, 1177, 1183, 1193, 1292, 1306-1308
164164
data_algebra/pandas_model.py 19 2 89% 32-33
165165
data_algebra/parse_by_lark.py 164 24 85% 71, 93, 108, 129-130, 137, 161, 171, 185-186, 188, 200, 206, 213-217, 245, 253, 263-266
166-
data_algebra/polars_model.py 512 55 89% 130, 139, 189, 401, 417, 427, 434, 449, 451, 465, 470, 475, 524, 542, 558, 622, 638-640, 667, 675, 704, 719, 737, 755, 775, 787-789, 792, 797, 799, 806-818, 825, 830, 861, 890, 899, 927, 942, 954, 1018-1019, 1028-1029, 1031
166+
data_algebra/polars_model.py 525 60 89% 130, 139, 189, 401, 417, 427, 434, 447-451, 459, 461, 475, 480, 485, 534, 552, 568, 632, 648-650, 677, 685, 737, 752, 770, 788, 808, 820-822, 825, 830, 832, 839-851, 858, 863, 894, 923, 932, 960, 975, 987, 1051-1053, 1062-1063, 1065
167167
data_algebra/python3_lark.py 1 0 100%
168-
data_algebra/solutions.py 136 4 97% 63, 308, 389, 472
168+
data_algebra/solutions.py 135 4 97% 63, 308, 389, 472
169169
data_algebra/sql_format_options.py 15 2 87% 61, 69
170-
data_algebra/test_util.py 331 62 81% 28-29, 104, 126, 136, 139, 143, 166, 169, 173, 175-178, 189, 246-247, 263-268, 272, 284, 286-294, 331, 333, 344, 352, 363, 370, 376, 388, 399, 413, 468, 472, 523-526, 528-531, 533-536, 538-541, 649-654, 659-660, 662
170+
data_algebra/test_util.py 333 62 81% 28-29, 104, 126, 136, 139, 143, 166, 169, 173, 175-178, 189, 246-247, 263-268, 272, 284, 286-294, 331, 333, 344, 352, 363, 370, 376, 388, 399, 413, 468, 472, 523-526, 528-531, 533-536, 538-541, 652-657, 662-663, 665
171171
data_algebra/util.py 127 28 78% 26, 59-60, 63-64, 67-68, 71-72, 75-76, 79-80, 83-84, 87-88, 91-92, 95-96, 143, 165, 167, 182, 223, 227, 229
172172
--------------------------------------------------------------------
173-
TOTAL 6755 939 86%
173+
TOTAL 6779 944 86%
174174

175175

176-
======================= 359 passed in 849.12s (0:14:09) ========================
176+
======================= 359 passed in 893.27s (0:14:53) ========================
294 Bytes
Binary file not shown.

dist/data_algebra-1.6.0.tar.gz

311 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)