Skip to content
Closed
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
52 commits
Select commit Hold shift + click to select a range
41e6ad2
feat: Add configurable display options for PyDataFrame
kosiew Mar 28, 2025
17d54cd
feat: Enhance DisplayConfig for DataFrame with customizable options
kosiew Mar 28, 2025
fd8f5a1
feat: Add display configuration methods to DataFrame class
kosiew Mar 28, 2025
5aae267
feat: Add display configuration tests for DataFrame
kosiew Mar 28, 2025
bb4516f
feat: Validate display configuration values in DataFrame
kosiew Mar 28, 2025
ca908f0
collect_record_batches_to_display without debug
kosiew Mar 28, 2025
727914d
Add tests for display_config
kosiew Mar 28, 2025
52091ce
fix: Update record batch display logic to use min_table_rows from config
kosiew Mar 28, 2025
da116bf
reuse _create_numeric_test_df
kosiew Mar 28, 2025
ee1de81
feat: Add max_table_rows_in_repr to control row display in DataFrame
kosiew Mar 28, 2025
929563a
tidy up comments, tests
kosiew Mar 28, 2025
cae89b0
Fix ruff errors
kosiew Mar 28, 2025
1bfa8b1
Trigger CI
kosiew Mar 28, 2025
f34a331
Fix ruff errors
kosiew Mar 28, 2025
cb151e3
fix: Simplify error handling in display_config method
kosiew Mar 28, 2025
0d5e900
refactor: Update display configuration handling in DataFrame
kosiew Mar 31, 2025
ba5acc4
Revert "refactor: Update display configuration handling in DataFrame"
kosiew Mar 31, 2025
0e30af3
Refactor PyDataFrame: Simplify methods and improve performance
kosiew Mar 31, 2025
a5d224f
Revert "Refactor PyDataFrame: Simplify methods and improve performance"
kosiew Mar 31, 2025
30c9d99
revert to before DisplayConfig in PyDataFrame
kosiew Apr 2, 2025
028f0ab
feat: Add DataframeDisplayConfig for customizable DataFrame display o…
kosiew Apr 2, 2025
b401e1a
feat: Add method to configure DataFrame display options in PySessionC…
kosiew Apr 2, 2025
d2a1dc9
feat: Add method to configure DataFrame display options in SessionCon…
kosiew Apr 2, 2025
07d7cf6
rename to PyDataframeDisplayConfig
kosiew Apr 2, 2025
625a1f2
feat: Add DataframeDisplayConfig class for customizable DataFrame dis…
kosiew Apr 2, 2025
5dfb9ce
Fix ruff errors
kosiew Apr 2, 2025
065fa40
feat: Enhance PyDataFrame to support customizable display options
kosiew Apr 2, 2025
7fa2c7c
Amend PyDataFrame to use display_config instead of constants
kosiew Apr 2, 2025
cbc4759
refactor: Simplify PySessionConfig and PySessionContext by removing u…
kosiew Apr 2, 2025
1737973
refactor: Update PyDataFrame methods to consistently use display_conf…
kosiew Apr 2, 2025
354ff45
feat: Add display configuration options to SessionContext for DataFra…
kosiew Apr 2, 2025
984b906
fix: Add validation for display configuration properties in Dataframe…
kosiew Apr 2, 2025
1326d71
feat: Integrate DataframeDisplayConfig into SessionContext initializa…
kosiew Apr 2, 2025
0c4eaa6
test: Add tests for DataframeDisplayConfig initialization and Session…
kosiew Apr 2, 2025
eef0a36
debug: Add logging to collect_record_batches_to_display for better tr…
kosiew Apr 2, 2025
815690b
test: Add display configuration tests for DataFrame representation an…
kosiew Apr 2, 2025
a5e16a3
refactor: Remove debug print statements from display configuration tests
kosiew Apr 2, 2025
efc041c
refactor: Extract validation logic into a separate method in Datafram…
kosiew Apr 3, 2025
d30c641
refactor: Enhance DataframeDisplayConfig initialization with value va…
kosiew Apr 3, 2025
b467100
test: Add fixture for test data and refactor tests to use it
kosiew Apr 3, 2025
2993854
fix: Update loop condition in collect_record_batches_to_display for c…
kosiew Apr 3, 2025
71c64b9
fix ruff errors
kosiew Apr 3, 2025
a878ed4
Merge branch 'main' into dataframe-display-config
kosiew Apr 3, 2025
ec7033a
fix ruff errors
kosiew Apr 3, 2025
ad83fc5
feat: Add optional display_config parameter to SessionContext constru…
kosiew Apr 3, 2025
fb90fbc
fix: Update test data size and improve display config tests
kosiew Apr 3, 2025
73edc6a
fix: Remove unused import of 'dis' in test_dataframe.py
kosiew Apr 3, 2025
f08c070
feat: Add display_config parameter to SessionContext constructor
kosiew Apr 3, 2025
2751759
fix: Increase test data size in data fixture for better coverage
kosiew Apr 3, 2025
c109ad2
docs: Add docstring to normalize_uuid function for clarity in testing
kosiew Apr 3, 2025
f3cdfbe
fix ruff errors
kosiew Apr 3, 2025
2fcc2c1
fix clippy errors
kosiew Apr 3, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 55 additions & 0 deletions python/datafusion/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
import pyarrow as pa

from datafusion._internal import DataFrame as DataFrameInternal
from datafusion._internal import DisplayConfig
from datafusion._internal import expr as expr_internal

from enum import Enum
Expand Down Expand Up @@ -813,6 +814,60 @@ def count(self) -> int:
"""
return self.df.count()

def configure_display(
    self,
    max_table_bytes: Optional[int] = None,
    min_table_rows: Optional[int] = None,
    max_cell_length: Optional[int] = None,
    max_table_rows_in_repr: Optional[int] = None,
) -> None:
    """Adjust how this DataFrame is rendered.

    Each argument is optional; the supplied values are validated here and
    then forwarded, together with any ``None`` placeholders, to the
    underlying DataFrame.

    Args:
        max_table_bytes: Upper bound, in bytes, on the data used for table
            presentation (default: 2MB). Lower it for large tables to limit
            memory usage.
        min_table_rows: Minimum number of table rows to display
            (default: 20). Used for initial display and in notebooks.
        max_cell_length: Maximum length of a cell before it gets minimized
            (default: 25). Longer cells are truncated with an expand button.
        max_table_rows_in_repr: Maximum number of rows shown in the string
            representation (default: 10).

    Raises:
        ValueError: If any supplied value is less than or equal to 0.
    """
    requested = (
        max_table_bytes,
        min_table_rows,
        max_cell_length,
        max_table_rows_in_repr,
    )

    # Validate before delegating so an invalid call never reaches self.df.
    for setting in requested:
        if setting is None:
            continue
        if setting <= 0:
            error_msg = "All values must be greater than 0."
            raise ValueError(error_msg)

    self.df.configure_display(*requested)

def reset_display_config(self) -> None:
    """Restore the display configuration to its default values.

    Delegates to the underlying DataFrame and returns nothing.
    """
    backing_frame = self.df
    backing_frame.reset_display_config()

@property
def display_config(self) -> DisplayConfig:
    """Expose the display configuration currently in effect.

    Returns:
        DisplayConfig: The settings object held by the underlying DataFrame.
    """
    backing_frame = self.df
    return backing_frame.display_config

@deprecated("Use :py:func:`unnest_columns` instead.")
def unnest_column(self, column: str, preserve_nulls: bool = True) -> DataFrame:
"""See :py:func:`unnest_columns`."""
Expand Down
281 changes: 281 additions & 0 deletions python/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1261,3 +1261,284 @@ def test_dataframe_repr_html(df) -> None:
body_lines = [f"<td(.*?)>{v}</td>" for inner in body_data for v in inner]
body_pattern = "(.*?)".join(body_lines)
assert len(re.findall(body_pattern, output, re.DOTALL)) == 1


def test_display_config(df):
    """Check that an unconfigured DataFrame exposes the default settings."""
    expected_defaults = {
        "max_table_bytes": 2 * 1024 * 1024,  # 2 MB
        "min_table_rows": 20,
        "max_cell_length": 25,
        "max_table_rows_in_repr": 10,
    }

    config = df.display_config
    for option, default in expected_defaults.items():
        assert getattr(config, option) == default


def test_configure_display(df):
    """Test setting display configuration properties.

    Covers three behaviours of ``configure_display``:

    * a full update changes every option,
    * a partial update leaves the unspecified options untouched,
    * zero and negative values are rejected for every option, and a
      rejected call leaves the configuration unchanged.
    """
    # Modify the display configuration
    df.configure_display(
        max_table_bytes=1024 * 1024,
        min_table_rows=10,
        max_cell_length=50,
        max_table_rows_in_repr=15,
    )

    # Verify the changes took effect
    config = df.display_config
    assert config.max_table_bytes == 1024 * 1024  # 1 MB
    assert config.min_table_rows == 10
    assert config.max_cell_length == 50
    assert config.max_table_rows_in_repr == 15

    # Test partial update (only changing one property)
    df.configure_display(max_table_rows_in_repr=5)
    config = df.display_config
    assert config.max_table_bytes == 1024 * 1024  # previous value retained
    assert config.min_table_rows == 10  # previous value retained
    assert config.max_cell_length == 50  # previous value retained
    assert config.max_table_rows_in_repr == 5  # only this value changed

    # Several zero values at once are rejected
    with pytest.raises(ValueError, match=r".*must be greater than 0.*"):
        df.configure_display(max_table_bytes=0, min_table_rows=0, max_cell_length=0)

    # Zero and negative values are rejected by configure_display's own
    # validation, which raises ValueError before anything is forwarded to the
    # underlying DataFrame. Each option is exercised on its own so that a
    # validator skipping any single option would be caught.
    option_names = (
        "max_table_bytes",
        "min_table_rows",
        "max_cell_length",
        "max_table_rows_in_repr",
    )
    for option in option_names:
        for bad_value in (0, -1):
            with pytest.raises(ValueError, match=r".*must be greater than 0.*"):
                df.configure_display(**{option: bad_value})

    # Rejected calls must not have altered the configuration
    config = df.display_config
    assert config.max_table_bytes == 1024 * 1024
    assert config.min_table_rows == 10
    assert config.max_cell_length == 50
    assert config.max_table_rows_in_repr == 5

    # Reset for next tests
    df.reset_display_config()


def test_reset_display_config(df):
    """Check that reset_display_config restores the default settings."""
    customized = {
        "max_table_bytes": 1024 * 1024,
        "min_table_rows": 10,
        "max_cell_length": 50,
        "max_table_rows_in_repr": 15,
    }
    defaults = {
        "max_table_bytes": 2 * 1024 * 1024,  # 2 MB
        "min_table_rows": 20,
        "max_cell_length": 25,
        "max_table_rows_in_repr": 10,
    }

    # Move every option away from its default and confirm it stuck
    df.configure_display(**customized)
    current = df.display_config
    for option, value in customized.items():
        assert getattr(current, option) == value

    # Resetting must bring every option back to its default
    df.reset_display_config()
    current = df.display_config
    for option, value in defaults.items():
        assert getattr(current, option) == value


def test_min_table_rows_display(ctx):
    """Check that the HTML output contains at least min_table_rows rows."""
    # More rows than the default min_table_rows, so the setting matters
    rows = 100
    custom_min_rows = 30

    df = _create_numeric_test_df(ctx, rows)
    df.configure_display(min_table_rows=custom_min_rows)
    html_output = df._repr_html_()

    # Every table row opens with <tr>; one of them is the header
    row_count = html_output.count("<tr>") - 1

    assert row_count >= custom_min_rows, (
        f"Expected at least {custom_min_rows} rows, got {row_count}"
    )

    # Nothing more to check when the whole dataframe was rendered
    if row_count >= rows:
        return

    # A partial rendering must announce the truncation
    assert "Data truncated" in html_output


def test_max_table_bytes_display(ctx):
    """Check that a smaller max_table_bytes reduces the rows rendered."""

    def _data_rows(html):
        # Count <tr> tags and drop the header row
        return html.count("<tr>") - 1

    # 50 strings of 1000 characters each, so the table is large in bytes
    payload = pa.array(["x" * 1000] * 50)
    batch = pa.RecordBatch.from_arrays([payload], names=["large_data"])
    df = ctx.create_dataframe([[batch]])

    # Baseline with the default settings
    default_row_count = _data_rows(df._repr_html_())

    # 5000 bytes should only leave room for a few of the rows
    df.configure_display(max_table_bytes=5000)
    limited_html = df._repr_html_()
    limited_row_count = _data_rows(limited_html)

    assert limited_row_count < default_row_count, (
        f"Expected fewer rows with byte limit. "
        f"Default: {default_row_count}, Limited: {limited_row_count}"
    )

    # The limited rendering must announce the truncation
    assert "Data truncated" in limited_html


def test_max_cell_length_display(ctx):
    """Check that cells longer than max_cell_length are rendered truncated."""
    max_length = 10
    long_text = "this is a very long string that should be truncated"

    cells = pa.array(["short", "medium text", long_text])
    batch = pa.RecordBatch.from_arrays([cells], names=["text"])
    df = ctx.create_dataframe([[batch]])

    # Only the last value exceeds this limit by a wide margin
    df.configure_display(max_cell_length=max_length)
    html_output = df._repr_html_()

    # Markup expected around an over-long cell
    assert "expandable-container" in html_output
    assert 'class="expandable"' in html_output

    # The shortened prefix, the expand button and the full (hidden) text
    # must all be present
    assert long_text[:max_length] in html_output
    assert "expand-btn" in html_output
    assert long_text in html_output


def test_display_config_repr_string(ctx):
    """Check that the string representation honours min_table_rows."""

    def _render(total_rows, min_rows):
        # Build a fresh dataframe so no earlier configuration carries over
        frame = _create_numeric_test_df(ctx, total_rows)
        frame.configure_display(min_table_rows=min_rows)
        return repr(frame)

    # 7 rows stays below the default repr limit of 10 rows
    min_table_rows_in_display = 5
    lines_count = _count_lines_in_str(_render(7, min_table_rows_in_display))
    assert lines_count >= min_table_rows_in_display

    # A larger dataframe with a higher minimum must be cut off ...
    min_table_rows_in_display = 7
    repr_str_more = _render(11, min_table_rows_in_display)
    assert "Data truncated" in repr_str_more

    # ... while still showing more rows than the first rendering
    lines_count2 = _count_lines_in_str(repr_str_more)
    assert lines_count2 > lines_count
    assert lines_count2 >= min_table_rows_in_display


def _count_lines_in_str(repr_str: str) -> int:
    """Count the data rows shown in a DataFrame's string representation.

    A line counts as a data row when it contains a cell made up only of
    digits between two ``|`` delimiters (for example ``| 0 |``); header and
    separator lines do not match.

    Args:
        repr_str: String representation of the DataFrame.

    Returns:
        Number of lines containing at least one purely numeric cell.
    """
    numeric_cell = re.compile(r"\|\s*\d+\s*\|")
    return sum(1 for line in repr_str.split("\n") if numeric_cell.search(line))


def _create_numeric_test_df(ctx, rows) -> DataFrame:
    """Build a single-column dataframe holding the integers 0 to rows-1.

    Args:
        ctx: SessionContext used to create the dataframe.
        rows: Number of rows to generate.

    Returns:
        DataFrame whose only column, "values", contains 0 to rows-1.
    """
    values_column = pa.array(list(range(rows)))
    single_batch = pa.RecordBatch.from_arrays([values_column], names=["values"])
    return ctx.create_dataframe([[single_batch]])


def test_max_table_rows_in_repr(ctx):
    """Check that max_table_rows_in_repr caps the rows in the string output."""
    # Twice the default cap of 10 rows
    rows = 20
    custom_max_rows = 15
    df = _create_numeric_test_df(ctx, rows)

    # Default configuration: at most 10 rows, and the cut is announced
    repr_str = repr(df)
    lines_default = _count_lines_in_str(repr_str)
    assert lines_default <= 10
    assert "Data truncated" in repr_str

    # Raised cap: more rows than before, still within the new cap
    df.configure_display(max_table_rows_in_repr=custom_max_rows)
    repr_str_more = repr(df)
    lines_custom = _count_lines_in_str(repr_str_more)
    assert lines_custom > lines_default
    assert lines_custom <= custom_max_rows
    assert "Data truncated" in repr_str_more

    # Cap above the row count: everything is shown, nothing is cut
    df.configure_display(max_table_rows_in_repr=25)
    repr_str_all = repr(df)
    lines_all = _count_lines_in_str(repr_str_all)
    assert lines_all == rows
    assert "Data truncated" not in repr_str_all
Loading