diff --git a/docs/source/user-guide/dataframe.rst b/docs/source/user-guide/dataframe.rst index a78fd8073..11e3d7e72 100644 --- a/docs/source/user-guide/dataframe.rst +++ b/docs/source/user-guide/dataframe.rst @@ -75,13 +75,17 @@ You can customize how DataFrames are rendered in HTML by configuring the formatt # Change the default styling configure_formatter( - max_rows=50, # Maximum number of rows to display - max_width=None, # Maximum width in pixels (None for auto) - theme="light", # Theme: "light" or "dark" - precision=2, # Floating point precision - thousands_separator=",", # Separator for thousands - date_format="%Y-%m-%d", # Date format - truncate_width=20 # Max width for string columns before truncating + max_cell_length=25, # Maximum characters in a cell before truncation + max_width=1000, # Maximum width in pixels + max_height=300, # Maximum height in pixels + max_memory_bytes=2097152, # Maximum memory for rendering (2MB) + min_rows_display=20, # Minimum number of rows to display + repr_rows=10, # Number of rows to display in __repr__ + enable_cell_expansion=True,# Allow expanding truncated cells + custom_css=None, # Additional custom CSS + show_truncation_message=True, # Show message when data is truncated + style_provider=None, # Custom styling provider + use_shared_styles=True # Share styles across tables ) The formatter settings affect all DataFrames displayed after configuration. @@ -113,6 +117,25 @@ For advanced styling needs, you can create a custom style provider: # Apply the custom style provider configure_formatter(style_provider=MyStyleProvider()) +Performance Optimization with Shared Styles +------------------------------------------- +The ``use_shared_styles`` parameter (enabled by default) optimizes performance when displaying +multiple DataFrames in notebook environments: + + .. 
code-block:: python + from datafusion.html_formatter import StyleProvider, configure_formatter + # Default: Use shared styles (recommended for notebooks) + configure_formatter(use_shared_styles=True) + + # Disable shared styles (each DataFrame includes its own styles) + configure_formatter(use_shared_styles=False) + +When ``use_shared_styles=True``: +- CSS styles and JavaScript are included only once per notebook session +- This reduces HTML output size and prevents style duplication +- Improves rendering performance with many DataFrames +- Applies consistent styling across all DataFrames + Creating a Custom Formatter --------------------------- @@ -177,3 +200,18 @@ You can also use a context manager to temporarily change formatting settings: # Back to default formatting df.show() + +Memory and Display Controls +--------------------------- + +You can control how much data is displayed and how much memory is used for rendering: + + .. code-block:: python + + configure_formatter( + max_memory_bytes=4 * 1024 * 1024, # 4MB maximum memory for display + min_rows_display=50, # Always show at least 50 rows + repr_rows=20 # Show 20 rows in __repr__ output + ) + +These parameters help balance comprehensive data display against performance considerations. \ No newline at end of file diff --git a/python/datafusion/html_formatter.py b/python/datafusion/html_formatter.py index a50e14fd5..12a7e4553 100644 --- a/python/datafusion/html_formatter.py +++ b/python/datafusion/html_formatter.py @@ -27,6 +27,36 @@ ) +def _validate_positive_int(value: Any, param_name: str) -> None: + """Validate that a parameter is a positive integer. 
+ + Args: + value: The value to validate + param_name: Name of the parameter (used in error message) + + Raises: + ValueError: If the value is not a positive integer + """ + if not isinstance(value, int) or value <= 0: + msg = f"{param_name} must be a positive integer" + raise ValueError(msg) + + +def _validate_bool(value: Any, param_name: str) -> None: + """Validate that a parameter is a boolean. + + Args: + value: The value to validate + param_name: Name of the parameter (used in error message) + + Raises: + TypeError: If the value is not a boolean + """ + if not isinstance(value, bool): + msg = f"{param_name} must be a boolean" + raise TypeError(msg) + + @runtime_checkable class CellFormatter(Protocol): """Protocol for cell value formatters.""" @@ -91,6 +121,9 @@ class DataFrameHtmlFormatter: max_cell_length: Maximum characters to display in a cell before truncation max_width: Maximum width of the HTML table in pixels max_height: Maximum height of the HTML table in pixels + max_memory_bytes: Maximum memory in bytes for rendered data (default: 2MB) + min_rows_display: Minimum number of rows to display + repr_rows: Default number of rows to display in repr output enable_cell_expansion: Whether to add expand/collapse buttons for long cell values custom_css: Additional CSS to include in the HTML output @@ -108,6 +141,9 @@ def __init__( max_cell_length: int = 25, max_width: int = 1000, max_height: int = 300, + max_memory_bytes: int = 2 * 1024 * 1024, # 2 MB + min_rows_display: int = 20, + repr_rows: int = 10, enable_cell_expansion: bool = True, custom_css: Optional[str] = None, show_truncation_message: bool = True, @@ -124,6 +160,12 @@ def __init__( Maximum width of the displayed table in pixels. max_height : int, default 300 Maximum height of the displayed table in pixels. + max_memory_bytes : int, default 2097152 (2MB) + Maximum memory in bytes for rendered data. + min_rows_display : int, default 20 + Minimum number of rows to display. 
+ repr_rows : int, default 10 + Default number of rows to display in repr output. enable_cell_expansion : bool, default True Whether to allow cells to expand when clicked. custom_css : str, optional @@ -139,7 +181,8 @@ def __init__( Raises: ------ ValueError - If max_cell_length, max_width, or max_height is not a positive integer. + If max_cell_length, max_width, max_height, max_memory_bytes, + min_rows_display, or repr_rows is not a positive integer. TypeError If enable_cell_expansion, show_truncation_message, or use_shared_styles is not a boolean, @@ -148,27 +191,17 @@ def __init__( protocol. """ # Validate numeric parameters - - if not isinstance(max_cell_length, int) or max_cell_length <= 0: - msg = "max_cell_length must be a positive integer" - raise ValueError(msg) - if not isinstance(max_width, int) or max_width <= 0: - msg = "max_width must be a positive integer" - raise ValueError(msg) - if not isinstance(max_height, int) or max_height <= 0: - msg = "max_height must be a positive integer" - raise ValueError(msg) + _validate_positive_int(max_cell_length, "max_cell_length") + _validate_positive_int(max_width, "max_width") + _validate_positive_int(max_height, "max_height") + _validate_positive_int(max_memory_bytes, "max_memory_bytes") + _validate_positive_int(min_rows_display, "min_rows_display") + _validate_positive_int(repr_rows, "repr_rows") # Validate boolean parameters - if not isinstance(enable_cell_expansion, bool): - msg = "enable_cell_expansion must be a boolean" - raise TypeError(msg) - if not isinstance(show_truncation_message, bool): - msg = "show_truncation_message must be a boolean" - raise TypeError(msg) - if not isinstance(use_shared_styles, bool): - msg = "use_shared_styles must be a boolean" - raise TypeError(msg) + _validate_bool(enable_cell_expansion, "enable_cell_expansion") + _validate_bool(show_truncation_message, "show_truncation_message") + _validate_bool(use_shared_styles, "use_shared_styles") # Validate custom_css if custom_css is 
not None and not isinstance(custom_css, str): @@ -183,6 +216,9 @@ def __init__( self.max_cell_length = max_cell_length self.max_width = max_width self.max_height = max_height + self.max_memory_bytes = max_memory_bytes + self.min_rows_display = min_rows_display + self.repr_rows = repr_rows self.enable_cell_expansion = enable_cell_expansion self.custom_css = custom_css self.show_truncation_message = show_truncation_message @@ -597,6 +633,9 @@ def configure_formatter(**kwargs: Any) -> None: **kwargs: Formatter configuration parameters like max_cell_length, max_width, max_height, enable_cell_expansion, etc. + Raises: + ValueError: If any invalid parameters are provided + Example: >>> from datafusion.html_formatter import configure_formatter >>> configure_formatter( @@ -606,6 +645,31 @@ def configure_formatter(**kwargs: Any) -> None: ... use_shared_styles=True ... ) """ + # Valid parameters accepted by DataFrameHtmlFormatter + valid_params = { + "max_cell_length", + "max_width", + "max_height", + "max_memory_bytes", + "min_rows_display", + "repr_rows", + "enable_cell_expansion", + "custom_css", + "show_truncation_message", + "style_provider", + "use_shared_styles", + } + + # Check for invalid parameters + invalid_params = set(kwargs) - valid_params + if invalid_params: + msg = ( + f"Invalid formatter parameters: {', '.join(invalid_params)}. 
" + f"Valid parameters are: {', '.join(valid_params)}" + ) + raise ValueError(msg) + + # Create and set formatter with validated parameters set_formatter(DataFrameHtmlFormatter(**kwargs)) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 464b884db..e01308c86 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -41,6 +41,8 @@ ) from pyarrow.csv import write_csv +MB = 1024 * 1024 + @pytest.fixture def ctx(): @@ -117,6 +119,31 @@ def clean_formatter_state(): reset_formatter() +# custom style for testing with html formatter +class CustomStyleProvider: + def get_cell_style(self) -> str: + return ( + "background-color: #f5f5f5; color: #333; padding: 8px; border: " + "1px solid #ddd;" + ) + + def get_header_style(self) -> str: + return ( + "background-color: #4285f4; color: white; font-weight: bold; " + "padding: 10px; border: 1px solid #3367d6;" + ) + + +def count_table_rows(html_content: str) -> int: + """Count the number of table rows in HTML content. 
+ Args: + html_content: HTML string to analyze + Returns: + Number of table rows found (number of <tr> tags) + """ + return len(re.findall(r" str: - return ( - "background-color: #f5f5f5; color: #333; padding: 8px; border: " - "1px solid #ddd;" - ) - - def get_header_style(self) -> str: - return ( - "background-color: #4285f4; color: white; font-weight: bold; " - "padding: 10px; border: 1px solid #3367d6;" - ) - # Configure with custom style provider configure_formatter(style_provider=CustomStyleProvider()) @@ -917,6 +930,141 @@ def get_header_style(self) -> str: assert "color: #5af" in html_output # Even numbers +def test_html_formatter_memory(df, clean_formatter_state): + """Test the memory and row control parameters in DataFrameHtmlFormatter.""" + configure_formatter(max_memory_bytes=10, min_rows_display=1) + html_output = df._repr_html_() + + # Count the number of table rows in the output + tr_count = count_table_rows(html_output) + # With a tiny memory limit of 10 bytes, the formatter should display + # the minimum number of rows (1) plus a message about truncation + assert tr_count == 2 # 1 for header row, 1 for data row + assert "data truncated" in html_output.lower() + + configure_formatter(max_memory_bytes=10 * MB, min_rows_display=1) + html_output = df._repr_html_() + # With larger memory limit and min_rows_display=1, should display all rows + tr_count = count_table_rows(html_output) + # Table should have header row (1) + 3 data rows = 4 rows + assert tr_count == 4 + # No truncation message should appear + assert "data truncated" not in html_output.lower() + + +def test_html_formatter_repr_rows(df, clean_formatter_state): + configure_formatter(min_rows_display=2, repr_rows=2) + html_output = df._repr_html_() + + tr_count = count_table_rows(html_output) + # Table should have header row (1) + 2 data rows = 3 rows + assert tr_count == 3 + + configure_formatter(min_rows_display=2, repr_rows=3) + html_output = df._repr_html_() + + tr_count = count_table_rows(html_output) +
# Table should have header row (1) + 3 data rows = 4 rows + assert tr_count == 4 + + +def test_html_formatter_validation(): + # Test validation for invalid parameters + + with pytest.raises(ValueError, match="max_cell_length must be a positive integer"): + DataFrameHtmlFormatter(max_cell_length=0) + + with pytest.raises(ValueError, match="max_width must be a positive integer"): + DataFrameHtmlFormatter(max_width=0) + + with pytest.raises(ValueError, match="max_height must be a positive integer"): + DataFrameHtmlFormatter(max_height=0) + + with pytest.raises(ValueError, match="max_memory_bytes must be a positive integer"): + DataFrameHtmlFormatter(max_memory_bytes=0) + + with pytest.raises(ValueError, match="max_memory_bytes must be a positive integer"): + DataFrameHtmlFormatter(max_memory_bytes=-100) + + with pytest.raises(ValueError, match="min_rows_display must be a positive integer"): + DataFrameHtmlFormatter(min_rows_display=0) + + with pytest.raises(ValueError, match="min_rows_display must be a positive integer"): + DataFrameHtmlFormatter(min_rows_display=-5) + + with pytest.raises(ValueError, match="repr_rows must be a positive integer"): + DataFrameHtmlFormatter(repr_rows=0) + + with pytest.raises(ValueError, match="repr_rows must be a positive integer"): + DataFrameHtmlFormatter(repr_rows=-10) + + +def test_configure_formatter(df, clean_formatter_state): + """Test using custom style providers with the HTML formatter and configured + parameters.""" + + # these are non-default values + max_cell_length = 10 + max_width = 500 + max_height = 30 + max_memory_bytes = 3 * MB + min_rows_display = 2 + repr_rows = 2 + enable_cell_expansion = False + show_truncation_message = False + use_shared_styles = False + + reset_formatter() + formatter_default = get_formatter() + + assert formatter_default.max_cell_length != max_cell_length + assert formatter_default.max_width != max_width + assert formatter_default.max_height != max_height + assert
formatter_default.max_memory_bytes != max_memory_bytes + assert formatter_default.min_rows_display != min_rows_display + assert formatter_default.repr_rows != repr_rows + assert formatter_default.enable_cell_expansion != enable_cell_expansion + assert formatter_default.show_truncation_message != show_truncation_message + assert formatter_default.use_shared_styles != use_shared_styles + + # Configure with custom style provider and additional parameters + configure_formatter( + max_cell_length=max_cell_length, + max_width=max_width, + max_height=max_height, + max_memory_bytes=max_memory_bytes, + min_rows_display=min_rows_display, + repr_rows=repr_rows, + enable_cell_expansion=enable_cell_expansion, + show_truncation_message=show_truncation_message, + use_shared_styles=use_shared_styles, + ) + formatter_custom = get_formatter() + assert formatter_custom.max_cell_length == max_cell_length + assert formatter_custom.max_width == max_width + assert formatter_custom.max_height == max_height + assert formatter_custom.max_memory_bytes == max_memory_bytes + assert formatter_custom.min_rows_display == min_rows_display + assert formatter_custom.repr_rows == repr_rows + assert formatter_custom.enable_cell_expansion == enable_cell_expansion + assert formatter_custom.show_truncation_message == show_truncation_message + assert formatter_custom.use_shared_styles == use_shared_styles + + +def test_configure_formatter_invalid_params(clean_formatter_state): + """Test that configure_formatter rejects invalid parameters.""" + with pytest.raises(ValueError, match="Invalid formatter parameters"): + configure_formatter(invalid_param=123) + + # Test with multiple parameters, one valid and one invalid + with pytest.raises(ValueError, match="Invalid formatter parameters"): + configure_formatter(max_width=500, not_a_real_param="test") + + # Test with multiple invalid parameters + with pytest.raises(ValueError, match="Invalid formatter parameters"): + configure_formatter(fake_param1="test", 
fake_param2=456) + + def test_get_dataframe(tmp_path): ctx = SessionContext() @@ -1505,9 +1653,8 @@ def add_with_parameter(df_internal, value: Any) -> DataFrame: assert result["new_col"] == [3 for _i in range(3)] -def test_dataframe_repr_html_structure(df) -> None: +def test_dataframe_repr_html_structure(df, clean_formatter_state) -> None: """Test that DataFrame._repr_html_ produces expected HTML output structure.""" - import re output = df._repr_html_() @@ -1537,7 +1684,7 @@ def test_dataframe_repr_html_structure(df) -> None: assert len(body_matches) == 1, "Expected pattern of values not found in HTML output" -def test_dataframe_repr_html_values(df): +def test_dataframe_repr_html_values(df, clean_formatter_state): """Test that DataFrame._repr_html_ contains the expected data values.""" html = df._repr_html_() assert html is not None diff --git a/src/dataframe.rs b/src/dataframe.rs index 787f63520..211e31bd1 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -71,8 +71,103 @@ impl PyTableProvider { PyTable::new(table_provider) } } -const MAX_TABLE_BYTES_TO_DISPLAY: usize = 2 * 1024 * 1024; // 2 MB -const MIN_TABLE_ROWS_TO_DISPLAY: usize = 20; + +/// Configuration for DataFrame display formatting +#[derive(Debug, Clone)] +pub struct FormatterConfig { + /// Maximum memory in bytes to use for display (default: 2MB) + pub max_bytes: usize, + /// Minimum number of rows to display (default: 20) + pub min_rows: usize, + /// Number of rows to include in __repr__ output (default: 10) + pub repr_rows: usize, +} + +impl Default for FormatterConfig { + fn default() -> Self { + Self { + max_bytes: 2 * 1024 * 1024, // 2MB + min_rows: 20, + repr_rows: 10, + } + } +} + +impl FormatterConfig { + /// Validates that all configuration values are positive integers. + /// + /// # Returns + /// + /// `Ok(())` if all values are valid, or an `Err` with a descriptive error message. 
+ pub fn validate(&self) -> Result<(), String> { + if self.max_bytes == 0 { + return Err("max_bytes must be a positive integer".to_string()); + } + + if self.min_rows == 0 { + return Err("min_rows must be a positive integer".to_string()); + } + + if self.repr_rows == 0 { + return Err("repr_rows must be a positive integer".to_string()); + } + + Ok(()) + } +} + +/// Holds the Python formatter and its configuration +struct PythonFormatter<'py> { + /// The Python formatter object + formatter: Bound<'py, PyAny>, + /// The formatter configuration + config: FormatterConfig, +} + +/// Get the Python formatter and its configuration +fn get_python_formatter_with_config(py: Python) -> PyResult<PythonFormatter> { + let formatter = import_python_formatter(py)?; + let config = build_formatter_config_from_python(&formatter)?; + Ok(PythonFormatter { formatter, config }) +} + +/// Get the Python formatter from the datafusion.html_formatter module +fn import_python_formatter(py: Python) -> PyResult<Bound<'_, PyAny>> { + let formatter_module = py.import("datafusion.html_formatter")?; + let get_formatter = formatter_module.getattr("get_formatter")?; + get_formatter.call0() +} + +// Helper function to extract attributes with fallback to default +fn get_attr<'a, T>(py_object: &'a Bound<'a, PyAny>, attr_name: &str, default_value: T) -> T +where + T: for<'py> pyo3::FromPyObject<'py> + Clone, +{ + py_object + .getattr(attr_name) + .and_then(|v| v.extract::<T>()) + .unwrap_or_else(|_| default_value.clone()) +} + +/// Helper function to create a FormatterConfig from a Python formatter object +fn build_formatter_config_from_python(formatter: &Bound<'_, PyAny>) -> PyResult<FormatterConfig> { + let default_config = FormatterConfig::default(); + let max_bytes = get_attr(formatter, "max_memory_bytes", default_config.max_bytes); + let min_rows = get_attr(formatter, "min_rows_display", default_config.min_rows); + let repr_rows = get_attr(formatter, "repr_rows", default_config.repr_rows); + + let config = FormatterConfig { + max_bytes, + min_rows, +
repr_rows, + }; + + // Return the validated config, converting String error to PyErr + config + .validate() + .map_err(|e| pyo3::exceptions::PyValueError::new_err(e))?; + Ok(config) +} /// A PyDataFrame is a representation of a logical plan and an API to compose statements. /// Use it to build a plan and `.collect()` to execute the plan and collect the result. @@ -114,9 +209,14 @@ impl PyDataFrame { } fn __repr__(&self, py: Python) -> PyDataFusionResult<String> { + // Get the Python formatter config + let PythonFormatter { + formatter: _, + config, + } = get_python_formatter_with_config(py)?; let (batches, has_more) = wait_for_future( py, - collect_record_batches_to_display(self.df.as_ref().clone(), 10, 10), + collect_record_batches_to_display(self.df.as_ref().clone(), config), )?; if batches.is_empty() { // This should not be reached, but do it for safety since we index into the vector below @@ -135,13 +235,11 @@ impl PyDataFrame { } fn _repr_html_(&self, py: Python) -> PyDataFusionResult<String> { + // Get the Python formatter and config + let PythonFormatter { formatter, config } = get_python_formatter_with_config(py)?; let (batches, has_more) = wait_for_future( py, - collect_record_batches_to_display( - self.df.as_ref().clone(), - MIN_TABLE_ROWS_TO_DISPLAY, - usize::MAX, - ), + collect_record_batches_to_display(self.df.as_ref().clone(), config), )?; if batches.is_empty() { // This should not be reached, but do it for safety since we index into the vector below @@ -158,12 +256,6 @@ impl PyDataFrame { let py_schema = self.schema().into_pyobject(py)?; - // Get the Python formatter module and call format_html - let formatter_module = py.import("datafusion.html_formatter")?; - let get_formatter = formatter_module.getattr("get_formatter")?; - let formatter = get_formatter.call0()?; - - // Call format_html method on the formatter let kwargs = pyo3::types::PyDict::new(py); let py_batches_list = PyList::new(py, py_batches.as_slice())?; kwargs.set_item("batches", py_batches_list)?; @@
-796,9 +888,14 @@ fn record_batch_into_schema( /// rows, set min_rows == max_rows. async fn collect_record_batches_to_display( df: DataFrame, - min_rows: usize, - max_rows: usize, + config: FormatterConfig, ) -> Result<(Vec<RecordBatch>, bool), DataFusionError> { + let FormatterConfig { + max_bytes, + min_rows, + repr_rows, + } = config; + let partitioned_stream = df.execute_stream_partitioned().await?; let mut stream = futures::stream::iter(partitioned_stream).flatten(); let mut size_estimate_so_far = 0; @@ -806,9 +903,8 @@ async fn collect_record_batches_to_display( let mut record_batches = Vec::default(); let mut has_more = false; - while (size_estimate_so_far < MAX_TABLE_BYTES_TO_DISPLAY && rows_so_far < max_rows) - || rows_so_far < min_rows - { + // ensure minimum rows even if memory/row limits are hit + while (size_estimate_so_far < max_bytes && rows_so_far < repr_rows) || rows_so_far < min_rows { let mut rb = match stream.next().await { None => { break; @@ -821,8 +917,8 @@ async fn collect_record_batches_to_display( if rows_in_rb > 0 { size_estimate_so_far += rb.get_array_memory_size(); - if size_estimate_so_far > MAX_TABLE_BYTES_TO_DISPLAY { - let ratio = MAX_TABLE_BYTES_TO_DISPLAY as f32 / size_estimate_so_far as f32; + if size_estimate_so_far > max_bytes { + let ratio = max_bytes as f32 / size_estimate_so_far as f32; let total_rows = rows_in_rb + rows_so_far; let mut reduced_row_num = (total_rows as f32 * ratio).round() as usize; @@ -838,8 +934,8 @@ async fn collect_record_batches_to_display( } } - if rows_in_rb + rows_so_far > max_rows { - rb = rb.slice(0, max_rows - rows_so_far); + if rows_in_rb + rows_so_far > repr_rows { + rb = rb.slice(0, repr_rows - rows_so_far); has_more = true; }