WinVector
diff --git a/‎build/lib/data_algebra/data_model.py‎
Lines changed: 12 additions & 0 deletions b/‎build/lib/data_algebra/data_model.py‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎build/lib/data_algebra/pandas_base.py‎
Lines changed: 14 additions & 4 deletions b/‎build/lib/data_algebra/pandas_base.py‎
Lines changed: 14 additions & 4 deletions
diff --git a/‎build/lib/data_algebra/polars_model.py‎
Lines changed: 58 additions & 24 deletions b/‎build/lib/data_algebra/polars_model.py‎
Lines changed: 58 additions & 24 deletions
diff --git a/‎build/lib/data_algebra/solutions.py‎
Lines changed: 4 additions & 5 deletions b/‎build/lib/data_algebra/solutions.py‎
Lines changed: 4 additions & 5 deletions
diff --git a/‎build/lib/data_algebra/test_util.py‎
Lines changed: 43 additions & 40 deletions b/‎build/lib/data_algebra/test_util.py‎
Lines changed: 43 additions & 40 deletions
diff --git a/‎coverage.txt‎
Lines changed: 7 additions & 7 deletions b/‎coverage.txt‎
Lines changed: 7 additions & 7 deletions
diff --git a/‎dist/data_algebra-1.6.0-py3-none-any.whl‎
294 Bytes b/‎dist/data_algebra-1.6.0-py3-none-any.whl‎
294 Bytes
diff --git a/‎dist/data_algebra-1.6.0.tar.gz‎
311 Bytes b/‎dist/data_algebra-1.6.0.tar.gz‎
311 Bytes
@@ -72,6 +72,18 @@ def table_is_keyed_by_columns(self, table, *, column_names: Iterable[str]) -> bo
         :return: True if rows are uniquely keyed by values in named columns
         """
 
+    @abc.abstractmethod
+    def concat_rows(self, frame_list: List):
+        """
+        Concatenate rows from frame_list
+
+        Abstract: declared here without a body, each concrete data model supplies the implementation.
+
+        :param frame_list: list of data frames whose rows are to be stacked
+        :return: the row-wise concatenation of the frames
+        """
+
+    @abc.abstractmethod
+    def concat_columns(self, frame_list):
+        """
+        Concatenate columns from frame_list
+
+        Abstract: declared here without a body, each concrete data model supplies the implementation.
+
+        :param frame_list: list of data frames whose columns are to be placed side by side
+        :return: the column-wise concatenation of the frames
+        """
+
     # evaluate
 
     @abc.abstractmethod
 
@@ -456,14 +456,24 @@ def bad_column_positions(self, x):
                 self.pd.isnull(x), numpy.logical_or(numpy.isnan(x), numpy.isinf(x))
             )
         return self.pd.isnull(x)
+    
+    def concat_rows(self, frame_list: List):
+        """
+        Stack the frames in frame_list on top of each other (row-wise).
+
+        :param frame_list: non-empty iterable of data frames accepted by self.pd.concat
+        :return: self.clean_copy() of the lone frame when exactly one is supplied,
+                 otherwise self.pd.concat(..., axis=0) of all frames
+        """
+        frames = list(frame_list)
+        n_frames = len(frames)
+        assert n_frames > 0
+        if n_frames > 1:
+            return self.pd.concat(frames, axis=0)
+        # single frame: nothing to concatenate, hand back a cleaned copy
+        return self.clean_copy(frames[0])
 
-    def concat_columns(self, frame_list):
+    def concat_columns(self, frame_list: List):
         """
-        Concatinate columns from frame_list
+        Concatenate columns from frame_list
         """
         frame_list = list(frame_list)
-        if len(frame_list) <= 0:
-            return None
+        assert len(frame_list) > 0
         if len(frame_list) == 1:
             return self.clean_copy(frame_list[0])
         res = self.pd.concat(frame_list, axis=1)
 
@@ -229,7 +229,7 @@ def _populate_expr_impl_map() -> Dict[int, Dict[str, Callable]]:
         "as_int64": lambda x: x.cast(int),
         "as_str": lambda x: x.cast(str),
         "base_Sunday": lambda x: x.base_Sunday(),
-        "bfill": lambda x: x.bfill(),
+        "bfill": lambda x: x.fill_null(strategy='backward'),
         "ceil": lambda x: x.ceil(),
         "coalesce0": lambda x: pl.when(x.is_null()).then(pl.col(_da_temp_zero_column_name)).otherwise(x),
         "cos": lambda x: x.cos(),
@@ -246,7 +246,7 @@ def _populate_expr_impl_map() -> Dict[int, Dict[str, Callable]]:
         "dayofyear": lambda x: x.dayofyear(),
         "exp": lambda x: x.exp(),
         "expm1": lambda x: x.expm1(),
-        "ffill": lambda x: x.ffill(),
+        "ffill": lambda x: x.fill_null(strategy='forward'),
         "first": lambda x: x.first(),
         "floor": lambda x: x.floor(),
         "format_date": lambda x: x.format_date(),
@@ -289,7 +289,6 @@ def _populate_expr_impl_map() -> Dict[int, Dict[str, Callable]]:
         "%/%": lambda a, b: a / b,
         "around": lambda a, b: a.round(b),
         "coalesce": lambda a, b: pl.when(a.is_null()).then(b).otherwise(a),
-        "concat": lambda a, b: a.concat(b),
         "date_diff": lambda a, b: a.date_diff(b),
         "is_in": lambda a, b: a.is_in(b),
         "mod": lambda a, b: a % b,
@@ -310,10 +309,10 @@ def _populate_expr_impl_map() -> Dict[int, Dict[str, Callable]]:
         "parse_datetime": lambda x, format : x.cast(str).str.strptime(pl.Datetime, fmt=format, strict=False).cast(pl.Datetime),
     }
     impl_map_3 = {
-        "if_else": lambda a, b, c: pl.when(a).then(b).otherwise(c),
+        "if_else": lambda a, b, c: pl.when(a.is_null()).then(pl.lit(None)).otherwise(pl.when(a).then(b).otherwise(c)),
         "mapv": _mapv,
         "trimstr": lambda a, b, c: a.trimstr(b, c),
-        "where": lambda a, b, c: pl.when(a).then(b).otherwise(c),
+        "where": lambda a, b, c: pl.when(a.is_null()).then(c).otherwise(pl.when(a).then(b).otherwise(c)),
     }
     impl_map = {
         0: impl_map_0,
@@ -367,6 +366,7 @@ def __init__(self, *, use_lazy_eval: bool = True):
         }
         self._expr_impl_map = _populate_expr_impl_map()
         self._impl_map_arbitrary_arity = {
+            "concat": lambda *args: pl.concat_str(args),
             "fmax": lambda *args: pl.max(args),
             "fmin": lambda *args: pl.min(args),
             "maximum": lambda *args: pl.max(args),
@@ -436,13 +436,23 @@ def drop_indices(self, df) -> None:
 
     def bad_column_positions(self, x):
         """
-        Return vector indicating which entries are bad (null or nan) (vectorized).
+        Return vector indicating which entries are null (vectorized).
         """
         return x.is_null()
 
+    def concat_rows(self, frame_list: List):
+        """
+        Concatenate rows from frame_list
+
+        :param frame_list: non-empty iterable of frames to stack vertically
+        :return: the single frame itself (not copied) when exactly one is supplied,
+                 otherwise pl.concat(..., how="vertical") of all frames
+        """
+        frame_list = list(frame_list)
+        assert len(frame_list) > 0
+        if len(frame_list) == 1:
+            return frame_list[0]
+        # the concatenated frame must be returned; previously the result was
+        # computed and dropped, so callers received None for 2+ frames
+        return pl.concat(frame_list, how="vertical")
+
     def concat_columns(self, frame_list):
         """
-        Concatinate columns from frame_list
+        Concatenate columns from frame_list
         """
         frame_list = list(frame_list)
         if len(frame_list) <= 0:
@@ -677,23 +687,46 @@ def _natural_join_step(self, op: data_algebra.data_ops_types.OperatorPlatform, *
             )
         inputs = [self._compose_polars_ops(s, data_map=data_map) for s in op.sources]
         assert len(inputs) == 2
-        res = inputs[0].join(
-            inputs[1],
-            left_on=op.on_a,
-            right_on=op.on_b,
-            how=op.jointype.lower(),
-            suffix = "_da_right_tmp",
-        )
-        coalesce_columns = set(op.sources[0].columns_produced()).intersection(op.sources[1].columns_produced()) - set(op.on_a)
-        if len(coalesce_columns) > 0:
-            res = res.with_columns([
-                pl.when(pl.col(c).is_null())
-                    .then(pl.col(c + "_da_right_tmp"))
-                    .otherwise(pl.col(c))
-                    .alias(c)
-                for c in coalesce_columns
-            ])
-            res = res.select(op.columns_produced())
+        how = op.jointype.lower()
+        if how == "full":
+            how = "outer"
+        coalesce_columns = (
+            set(op.sources[0].columns_produced()).intersection(op.sources[1].columns_produced()) 
+            - set(op.on_a))
+        if how != "right":
+            res = inputs[0].join(
+                inputs[1],
+                left_on=op.on_a,
+                right_on=op.on_b,
+                how=how,
+                suffix = "_da_right_tmp",
+            )
+            if len(coalesce_columns) > 0:
+                res = res.with_columns([
+                    pl.when(pl.col(c).is_null())
+                        .then(pl.col(c + "_da_right_tmp"))
+                        .otherwise(pl.col(c))
+                        .alias(c)
+                    for c in coalesce_columns
+                ])
+        else:
+            # simulate right join with left join
+            res = inputs[1].join(
+                inputs[0],
+                left_on=op.on_b,
+                right_on=op.on_a,
+                how="left",
+                suffix = "_da_left_tmp",
+            )
+            if len(coalesce_columns) > 0:
+                res = res.with_columns([
+                    pl.when(pl.col(c + "_da_left_tmp").is_null())
+                        .then(pl.col(c))
+                        .otherwise(pl.col(c + "_da_left_tmp"))
+                        .alias(c)
+                    for c in coalesce_columns
+                ])
+        res = res.select(op.columns_produced())
         return res
 
     def _order_rows_step(self, op: data_algebra.data_ops_types.OperatorPlatform, *, data_map: Dict[str, Any]):
@@ -1016,6 +1049,7 @@ def act_on_expression(self, *, arg, values: List, op):
         if (f is None): 
             if op.op in ["_ngroup", "ngroup"]:
                 assert isinstance(arg, pl.DataFrame)
+                # n_groups = arg.groupby(["x"]).apply(lambda x: x.head(1)).shape[0]
                 raise ValueError(f" {op.op} not implemented for Polars adapter, yet")
         if f is None:
             try:
 
@@ -224,7 +224,7 @@ def xicor_score_variables_plan(
     assert isinstance(n_rep, int)
     record_map = RecordMap(
         blocks_out=RecordSpecification(
-            control_table=data_algebra.data_model.default_data_model().pd.DataFrame(
+            control_table=data_algebra.data_model.default_data_model().data_frame(
                 {
                     "variable_name": x_vars,
                     "x": x_vars,
@@ -237,7 +237,7 @@ def xicor_score_variables_plan(
         ),
         strict=False,
     )
-    rep_frame = data_algebra.data_model.default_data_model().pd.DataFrame({"rep": range(n_rep)})
+    rep_frame = data_algebra.data_model.default_data_model().data_frame({"rep": range(n_rep)})
     grouped_calc = (
         xicor_query(
             d
@@ -529,13 +529,12 @@ def replicate_rows_query(
     assert power_key_colname not in d.column_names
     # get a pandas namespace
     local_data_model = data_algebra.data_model.default_data_model()
-    pd = local_data_model.pd
     # build powers of 2 until max_count is met or exceeded
     powers = list(range(int(numpy.ceil(numpy.log(max_count) / numpy.log(2))) + 1))
     # replicate each power the number of times it specifies
-    count_frame = pd.concat(
+    count_frame = local_data_model.concat_rows(
         [
-            pd.DataFrame(
+            local_data_model.data_frame(
                 {
                     power_key_colname: f"p{p}",
                     seq_column_name: range(int(2**p)),
 
@@ -553,6 +553,7 @@ def check_transform(
     cols_case_sensitive: bool = False,
     check_row_order: bool = False,
     check_parse: bool = True,
+    try_on_DBs: bool = True,
     models_to_skip: Optional[Iterable] = None,
     valid_for_empty: bool = True,
     empty_produces_empty: bool = True,
@@ -571,6 +572,7 @@ def check_transform(
     :param cols_case_sensitive: passed to equivalent_frames()
     :param check_row_order: passed to equivalent_frames()
     :param check_parse: if True check expression parses/formats to self
+    :param try_on_DBs: if true, try on databases
     :param models_to_skip: None or set of model names or models to skip testing
     :param valid_for_empty: logical, if True perform tests on empty inputs
     :param empty_produces_empty: logical, if True assume empty inputs should produce empty output
@@ -583,6 +585,7 @@ def check_transform(
         cols_used = ops.columns_used()
         table_name = [k for k in cols_used.keys()][0]
         data = {table_name: data}
+    assert isinstance(try_on_DBs, bool)
     assert isinstance(try_on_Polars, bool)
     assert expect is not None
     if local_data_model is None:
@@ -619,44 +622,44 @@ def check_transform(
             empty_produces_empty=empty_produces_empty,
             local_data_model=polars_data_model,
         )
-
-    caught: Optional[Any] = None
-    db_handles = [
-        # non-connected handles, lets us test some of the SQL generation path
-        data_algebra.SQLite.SQLiteModel().db_handle(None),
-        data_algebra.BigQuery.BigQueryModel().db_handle(None),
-        data_algebra.PostgreSQL.PostgreSQLModel().db_handle(None),
-        data_algebra.SparkSQL.SparkSQLModel().db_handle(None),
-        data_algebra.MySQL.MySQLModel().db_handle(None),
-    ]
-    try:
-        test_dbs = get_test_dbs()
-        db_handles = db_handles + test_dbs
-        if models_to_skip is not None:
-            models_to_skip = {str(m) for m in models_to_skip}
-            db_handles = [h for h in db_handles if str(h.db_model) not in models_to_skip]
-        _check_transform_on_handles(
-            ops=ops,
-            data=data,
-            expect=expect,
-            float_tol=float_tol,
-            check_column_order=check_column_order,
-            cols_case_sensitive=cols_case_sensitive,
-            check_row_order=check_row_order,
-            db_handles=db_handles,
-            local_data_model=local_data_model,
-        )
-    except AssertionError as ase:
-        traceback.print_exc()
-        caught = ase
-    except Exception as exc:
-        traceback.print_exc()
-        caught = exc
-    for handle in db_handles:
-        # noinspection PyBroadException
+    if try_on_DBs:
+        caught: Optional[Any] = None
+        db_handles = [
+            # non-connected handles, lets us test some of the SQL generation path
+            data_algebra.SQLite.SQLiteModel().db_handle(None),
+            data_algebra.BigQuery.BigQueryModel().db_handle(None),
+            data_algebra.PostgreSQL.PostgreSQLModel().db_handle(None),
+            data_algebra.SparkSQL.SparkSQLModel().db_handle(None),
+            data_algebra.MySQL.MySQLModel().db_handle(None),
+        ]
         try:
-            handle.close()
-        except Exception:
-            pass
-    if caught is not None:
-        raise caught
+            test_dbs = get_test_dbs()
+            db_handles = db_handles + test_dbs
+            if models_to_skip is not None:
+                models_to_skip = {str(m) for m in models_to_skip}
+                db_handles = [h for h in db_handles if str(h.db_model) not in models_to_skip]
+            _check_transform_on_handles(
+                ops=ops,
+                data=data,
+                expect=expect,
+                float_tol=float_tol,
+                check_column_order=check_column_order,
+                cols_case_sensitive=cols_case_sensitive,
+                check_row_order=check_row_order,
+                db_handles=db_handles,
+                local_data_model=local_data_model,
+            )
+        except AssertionError as ase:
+            traceback.print_exc()
+            caught = ase
+        except Exception as exc:
+            traceback.print_exc()
+            caught = exc
+        for handle in db_handles:
+            # noinspection PyBroadException
+            try:
+                handle.close()
+            except Exception:
+                pass
+        if caught is not None:
+            raise caught
@@ -144,7 +144,7 @@ data_algebra/__init__.py                   9      0   100%
 data_algebra/arrow.py                    141     41    71%   45, 52, 56-57, 66, 69, 73, 92, 96, 102, 113-116, 121, 129, 136, 155, 158, 171-172, 203, 212, 221-234, 237-245, 258, 260, 262, 266, 270
 data_algebra/cdata.py                    287     42    85%   48, 50, 54, 56, 64, 67, 73, 77, 80, 85, 89, 97, 105, 110, 154, 231, 237, 243, 246, 265, 267, 269, 272, 275, 282-284, 299, 310, 325, 357, 364-367, 378, 414, 449, 472, 477, 482, 487, 491
 data_algebra/connected_components.py      22      0   100%
-data_algebra/data_model.py                60      0   100%
+data_algebra/data_model.py                64      0   100%
 data_algebra/data_model_space.py          51      7    86%   23, 39-40, 45, 55-56, 62
 data_algebra/data_ops.py                1336    201    85%   35-36, 55-56, 93, 129, 223, 279, 338, 375, 377, 379, 381, 385, 447, 492, 523, 550, 584, 590, 592, 677, 679, 728, 751, 757, 772, 774, 785, 797, 825, 827, 840, 842, 848, 860, 863, 875, 878, 891, 893, 895, 897, 908, 910, 946, 962, 982, 984, 986, 988, 992, 1027-1035, 1038-1048, 1064, 1088, 1126-1129, 1134, 1138, 1262, 1267, 1272, 1274, 1281, 1283, 1291, 1297, 1299, 1301, 1304, 1307, 1310, 1315, 1332, 1344, 1359, 1402, 1404, 1406, 1408, 1410, 1412, 1452, 1462, 1477, 1552, 1554, 1564, 1570, 1573, 1591, 1601, 1661, 1663, 1665, 1668, 1679, 1752, 1754, 1790-1794, 1798, 1802, 1814, 1866, 1870, 1873, 1875, 1906-1910, 1914, 1916, 1928, 1978, 1983, 1988, 2019-2023, 2027, 2029, 2040, 2091, 2096, 2102, 2105, 2121-2125, 2131, 2133, 2135, 2137, 2149, 2219, 2267-2271, 2275, 2277, 2279, 2290, 2405-2409, 2413, 2415, 2426, 2504, 2519, 2549, 2569, 2571, 2573, 2575, 2586, 2659, 2665, 2667, 2687-2691, 2700, 2702, 2704, 2706, 2717, 2787, 2803-2807, 2811, 2813, 2859, 2937-2945, 2949, 2951, 2953, 2955, 2961
 data_algebra/data_ops_types.py            82     10    88%   325, 339-340, 344-348, 353, 361
@@ -160,17 +160,17 @@ data_algebra/flow_text.py                 17      0   100%
 data_algebra/near_sql.py                 237      3    99%   41, 256-257
 data_algebra/op_catalog.py                 3      0   100%
 data_algebra/op_container.py             127     49    61%   46-47, 63-71, 80-81, 88-89, 92-93, 96, 99, 103-104, 109, 114, 142-143, 146-152, 164-177, 180-183, 186-187, 198-199, 206-207, 214-215, 218-219, 222-226, 232-233, 237, 240, 243, 246
-data_algebra/pandas_base.py              718     67    91%   54, 68, 77, 87, 92, 101, 223, 225, 239, 242, 247, 252, 466, 468, 482, 487, 492, 504, 510-517, 524, 558-563, 593, 597, 600, 602, 639, 693, 734, 751, 771, 789, 799, 814, 862, 870, 878, 893, 904, 916, 935, 950, 978, 993, 1030, 1047, 1050, 1061, 1080, 1087, 1116, 1138, 1162, 1167, 1173, 1183, 1282, 1296-1298
+data_algebra/pandas_base.py              724     67    91%   54, 68, 77, 87, 92, 101, 223, 225, 239, 242, 247, 252, 467, 478, 492, 497, 502, 514, 520-527, 534, 568-573, 603, 607, 610, 612, 649, 703, 744, 761, 781, 799, 809, 824, 872, 880, 888, 903, 914, 926, 945, 960, 988, 1003, 1040, 1057, 1060, 1071, 1090, 1097, 1126, 1148, 1172, 1177, 1183, 1193, 1292, 1306-1308
 data_algebra/pandas_model.py              19      2    89%   32-33
 data_algebra/parse_by_lark.py            164     24    85%   71, 93, 108, 129-130, 137, 161, 171, 185-186, 188, 200, 206, 213-217, 245, 253, 263-266
-data_algebra/polars_model.py             512     55    89%   130, 139, 189, 401, 417, 427, 434, 449, 451, 465, 470, 475, 524, 542, 558, 622, 638-640, 667, 675, 704, 719, 737, 755, 775, 787-789, 792, 797, 799, 806-818, 825, 830, 861, 890, 899, 927, 942, 954, 1018-1019, 1028-1029, 1031
+data_algebra/polars_model.py             525     60    89%   130, 139, 189, 401, 417, 427, 434, 447-451, 459, 461, 475, 480, 485, 534, 552, 568, 632, 648-650, 677, 685, 737, 752, 770, 788, 808, 820-822, 825, 830, 832, 839-851, 858, 863, 894, 923, 932, 960, 975, 987, 1051-1053, 1062-1063, 1065
 data_algebra/python3_lark.py               1      0   100%
-data_algebra/solutions.py                136      4    97%   63, 308, 389, 472
+data_algebra/solutions.py                135      4    97%   63, 308, 389, 472
 data_algebra/sql_format_options.py        15      2    87%   61, 69
-data_algebra/test_util.py                331     62    81%   28-29, 104, 126, 136, 139, 143, 166, 169, 173, 175-178, 189, 246-247, 263-268, 272, 284, 286-294, 331, 333, 344, 352, 363, 370, 376, 388, 399, 413, 468, 472, 523-526, 528-531, 533-536, 538-541, 649-654, 659-660, 662
+data_algebra/test_util.py                333     62    81%   28-29, 104, 126, 136, 139, 143, 166, 169, 173, 175-178, 189, 246-247, 263-268, 272, 284, 286-294, 331, 333, 344, 352, 363, 370, 376, 388, 399, 413, 468, 472, 523-526, 528-531, 533-536, 538-541, 652-657, 662-663, 665
 data_algebra/util.py                     127     28    78%   26, 59-60, 63-64, 67-68, 71-72, 75-76, 79-80, 83-84, 87-88, 91-92, 95-96, 143, 165, 167, 182, 223, 227, 229
 --------------------------------------------------------------------
-TOTAL                                   6755    939    86%
+TOTAL                                   6779    944    86%
 
 
-======================= 359 passed in 849.12s (0:14:09) ========================
+======================= 359 passed in 893.27s (0:14:53) ========================