Skip to content

Commit 69baaa9

Browse files
authored
Merge pull request #23 from rich-iannone/feat-get-column-row-count
feat: add the `get_column_count()` and `get_row_count()` functions
2 parents b3fb756 + cef171a commit 69baaa9

File tree

4 files changed

+233
-3
lines changed

4 files changed

+233
-3
lines changed

docs/_quarto.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,4 +120,6 @@ quartodoc:
120120
contents:
121121
- name: load_dataset
122122
- name: preview
123+
- name: get_column_count
124+
- name: get_row_count
123125
- name: config

pointblank/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
from pointblank.validate import Validate, load_dataset, config
2424
from pointblank.schema import Schema
2525
from pointblank.thresholds import Thresholds
26-
from pointblank.preview import preview
26+
from pointblank.preview import preview, get_column_count, get_row_count
2727

2828
__all__ = [
2929
"TF",
@@ -41,4 +41,6 @@
4141
"load_dataset",
4242
"config",
4343
"preview",
44+
"get_column_count",
45+
"get_row_count",
4446
]

pointblank/preview.py

Lines changed: 167 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from pointblank.validate import _create_table_type_html, _create_table_dims_html
1111
from pointblank._utils import _get_tbl_type, _check_any_df_lib, _select_df_lib
1212

13-
__all__ = ["preview"]
13+
__all__ = ["preview", "get_column_count", "get_row_count"]
1414

1515

1616
def preview(
@@ -498,3 +498,169 @@ def _select_columns(
498498
if tbl_type == "polars":
499499
return data.select(resolved_columns)
500500
return data[resolved_columns]
501+
502+
503+
def get_column_count(data: FrameT | Any) -> int:
    """
    Get the number of columns in a table.

    The `get_column_count()` function returns the number of columns in a table. The function works
    with any table that is supported by the `pointblank` library, including Pandas, Polars, and Ibis
    backend tables (e.g., DuckDB, MySQL, PostgreSQL, SQLite, Parquet, etc.).

    Parameters
    ----------
    data
        The table for which to get the column count, which could be a DataFrame object or an Ibis
        table object. Read the *Supported Input Table Types* section for details on the supported
        table types.

    Returns
    -------
    int
        The number of columns in the table.

    Raises
    ------
    ValueError
        If the object supplied in `data=` is not one of the supported table types.

    Supported Input Table Types
    ---------------------------
    The `data=` parameter can be given any of the following table types:

    - Polars DataFrame (`"polars"`)
    - Pandas DataFrame (`"pandas"`)
    - DuckDB table (`"duckdb"`)*
    - MySQL table (`"mysql"`)*
    - PostgreSQL table (`"postgresql"`)*
    - SQLite table (`"sqlite"`)*
    - Parquet table (`"parquet"`)*

    The table types marked with an asterisk need to be prepared as Ibis tables (with type of
    `ibis.expr.types.relations.Table`). Furthermore, using `get_column_count()` with these types of
    tables requires the Ibis library (`v9.5.0` or above) to be installed. If the input table is a
    Polars or Pandas DataFrame, the availability of Ibis is not needed.

    Examples
    --------
    To get the number of columns in a table, we can use the `get_column_count()` function. Here's an
    example using the `small_table` dataset (itself loaded using the `load_dataset()` function):

    ```{python}
    import pointblank as pb

    small_table_polars = pb.load_dataset("small_table")

    pb.get_column_count(small_table_polars)
    ```

    This table is a Polars DataFrame, but the `get_column_count()` function works with any table
    supported by `pointblank`, including Pandas DataFrames and Ibis backend tables. Here's an
    example using a DuckDB table handled by Ibis:

    ```{python}
    small_table_duckdb = pb.load_dataset("small_table", tbl_type="duckdb")

    pb.get_column_count(small_table_duckdb)
    ```

    The function always returns the number of columns in the table as an integer value, which is
    `8` for the `small_table` dataset.
    """

    # Dispatch on the textual form of the type so that none of the optional libraries
    # (Ibis, Polars, Pandas) has to be imported just to recognize the table
    tbl_type_repr = str(type(data))

    if "ibis.expr.types.relations.Table" in tbl_type_repr or "polars" in tbl_type_repr:

        # Both kinds of table are measured through `.columns`; objects from these libraries
        # that lack the attribute fall through to the `ValueError` below
        column_names = getattr(data, "columns", None)
        if column_names is not None:
            return len(column_names)

    elif "pandas" in tbl_type_repr:

        # Only two-dimensional Pandas objects have a column axis; a Series has a one-element
        # `shape` and is rejected below instead of failing with an `IndexError`
        shape = getattr(data, "shape", None)
        if isinstance(shape, tuple) and len(shape) == 2:
            return shape[1]

    raise ValueError("The input table type supplied in `data=` is not supported.")
578+
579+
580+
def get_row_count(data: FrameT | Any) -> int:
    """
    Get the number of rows in a table.

    The `get_row_count()` function returns the number of rows in a table. The function works with
    any table that is supported by the `pointblank` library, including Pandas, Polars, and Ibis
    backend tables (e.g., DuckDB, MySQL, PostgreSQL, SQLite, Parquet, etc.).

    Parameters
    ----------
    data
        The table for which to get the row count, which could be a DataFrame object or an Ibis table
        object. Read the *Supported Input Table Types* section for details on the supported table
        types.

    Returns
    -------
    int
        The number of rows in the table.

    Raises
    ------
    ValueError
        If the object supplied in `data=` is not one of the supported table types.
    ImportError
        If `data=` is an Ibis table and neither Polars nor Pandas is available to obtain the count.

    Supported Input Table Types
    ---------------------------
    The `data=` parameter can be given any of the following table types:

    - Polars DataFrame (`"polars"`)
    - Pandas DataFrame (`"pandas"`)
    - DuckDB table (`"duckdb"`)*
    - MySQL table (`"mysql"`)*
    - PostgreSQL table (`"postgresql"`)*
    - SQLite table (`"sqlite"`)*
    - Parquet table (`"parquet"`)*

    The table types marked with an asterisk need to be prepared as Ibis tables (with type of
    `ibis.expr.types.relations.Table`). Furthermore, using `get_row_count()` with these types of
    tables requires the Ibis library (`v9.5.0` or above) to be installed. If the input table is a
    Polars or Pandas DataFrame, the availability of Ibis is not needed.

    Examples
    --------
    Getting the number of rows in a table is easily done by using the `get_row_count()` function.
    Here's an example using the `game_revenue` dataset (itself loaded using the `load_dataset()`
    function):

    ```{python}
    import pointblank as pb

    game_revenue_polars = pb.load_dataset("game_revenue")

    pb.get_row_count(game_revenue_polars)
    ```

    This table is a Polars DataFrame, but the `get_row_count()` function works with any table
    supported by `pointblank`, including Pandas DataFrames and Ibis backend tables. Here's an
    example using a DuckDB table handled by Ibis:

    ```{python}
    game_revenue_duckdb = pb.load_dataset("game_revenue", tbl_type="duckdb")

    pb.get_row_count(game_revenue_duckdb)
    ```

    The function always returns the number of rows in the table as an integer value, which is `2000`
    for the `game_revenue` dataset.
    """

    # Dispatch on the textual form of the type so that none of the optional libraries
    # (Ibis, Polars, Pandas) has to be imported just to recognize the table
    tbl_type_repr = str(type(data))

    if "ibis.expr.types.relations.Table" in tbl_type_repr:

        # Determine whether Pandas or Polars is available to get the row count
        _check_any_df_lib(method_used="get_row_count")

        # Select the DataFrame library used to materialize the count (Polars is preferred)
        df_lib = _select_df_lib(preference="polars")

        if df_lib.__name__ == "pandas":
            return int(data.count().to_pandas())
        return int(data.count().to_polars())

    if "polars" in tbl_type_repr:

        # Objects from Polars that do not expose `height` fall through to the `ValueError`
        # below instead of failing with an `AttributeError`
        height = getattr(data, "height", None)
        if height is not None:
            return int(height)

    elif "pandas" in tbl_type_repr:

        # The first entry of `shape` is the row count; Pandas objects without a `shape`
        # fall through to the `ValueError` below
        shape = getattr(data, "shape", None)
        if isinstance(shape, tuple) and shape:
            return int(shape[0])

    raise ValueError("The input table type supplied in `data=` is not supported.")

tests/test_preview.py

Lines changed: 61 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
first_n,
1414
last_n,
1515
)
16-
from pointblank.preview import preview
16+
from pointblank.preview import preview, get_column_count, get_row_count
1717
from pointblank.validate import load_dataset
1818

1919

@@ -134,3 +134,63 @@ def test_preview_with_columns_subset_failing(tbl_type):
134134
preview(tbl, columns_subset=["fake_id", "item_name", "item_revenue"])
135135
with pytest.raises(ValueError):
136136
preview(tbl, columns_subset=col(matches("fake_id")))
137+
138+
139+
@pytest.mark.parametrize("tbl_type", ["pandas", "polars", "duckdb"])
def test_get_column_count(tbl_type):
    """The bundled datasets report their known column counts for each parametrized table type."""

    # Load both datasets up front, then compare each against its expected column count
    tables = {
        name: load_dataset(dataset=name, tbl_type=tbl_type)
        for name in ("small_table", "game_revenue")
    }

    assert get_column_count(tables["small_table"]) == 8
    assert get_column_count(tables["game_revenue"]) == 11
147+
148+
149+
def test_get_column_count_failing():
    """Inputs that are not tables are rejected with a `ValueError`."""

    for not_a_table in (None, "not a table"):
        with pytest.raises(ValueError):
            get_column_count(not_a_table)
155+
156+
157+
@pytest.mark.parametrize("tbl_type", ["pandas", "polars", "duckdb"])
def test_get_row_count(tbl_type):
    """The bundled datasets report their known row counts for each parametrized table type."""

    # Load both datasets up front, then compare each against its expected row count
    tables = {
        name: load_dataset(dataset=name, tbl_type=tbl_type)
        for name in ("small_table", "game_revenue")
    }

    assert get_row_count(tables["small_table"]) == 13
    assert get_row_count(tables["game_revenue"]) == 2000
165+
166+
167+
def test_get_row_count_failing():
    """Inputs that are not tables are rejected with a `ValueError`."""

    for not_a_table in (None, "not a table"):
        with pytest.raises(ValueError):
            get_row_count(not_a_table)
173+
174+
175+
def test_get_row_count_no_polars_duckdb_table():
    """Row counting of an Ibis (DuckDB) table when Polars and/or Pandas cannot be imported."""

    small_table = load_dataset(dataset="small_table", tbl_type="duckdb")

    # Mock the absence of the Polars library, which is the preferred library for obtaining
    # the row count of an Ibis table; this should not raise an error since Pandas is the
    # fallback library and is available
    with patch.dict(sys.modules, {"polars": None}):
        assert get_row_count(small_table) == 13

    # Mock the absence of the Pandas library, which is the fallback library for obtaining
    # the row count of an Ibis table; this should not raise an error since Polars is the
    # preferred library and is available
    with patch.dict(sys.modules, {"pandas": None}):
        assert get_row_count(small_table) == 13

    # Mock the absence of both the Polars and Pandas libraries; this should raise an error
    # since no library is left to obtain the row count of an Ibis table. The call itself
    # must raise, so there is no return value to compare against
    with patch.dict(sys.modules, {"polars": None, "pandas": None}):
        with pytest.raises(ImportError):
            get_row_count(small_table)

0 commit comments

Comments
 (0)