diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index ecf5545bc..60d0d61b4 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -45,6 +45,7 @@ Expr, WindowFrame, ) +from .html_formatter import configure_formatter from .io import read_avro, read_csv, read_json, read_parquet from .plan import ExecutionPlan, LogicalPlan from .record_batch import RecordBatch, RecordBatchStream @@ -76,6 +77,7 @@ "col", "column", "common", + "configure_formatter", "expr", "functions", "lit", diff --git a/python/datafusion/html_formatter.py b/python/datafusion/html_formatter.py new file mode 100644 index 000000000..a50e14fd5 --- /dev/null +++ b/python/datafusion/html_formatter.py @@ -0,0 +1,647 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""HTML formatting utilities for DataFusion DataFrames.""" + +from __future__ import annotations + +from typing import ( + Any, + Callable, + Optional, + Protocol, + runtime_checkable, +) + + +@runtime_checkable +class CellFormatter(Protocol): + """Protocol for cell value formatters.""" + + def __call__(self, value: Any) -> str: + """Format a cell value to string representation.""" + ... + + +@runtime_checkable +class StyleProvider(Protocol): + """Protocol for HTML style providers.""" + + def get_cell_style(self) -> str: + """Get the CSS style for table cells.""" + ... + + def get_header_style(self) -> str: + """Get the CSS style for header cells.""" + ... + + +class DefaultStyleProvider: + """Default implementation of StyleProvider.""" + + def get_cell_style(self) -> str: + """Get the CSS style for table cells. + + Returns: + CSS style string + """ + return ( + "border: 1px solid black; padding: 8px; text-align: left; " + "white-space: nowrap;" + ) + + def get_header_style(self) -> str: + """Get the CSS style for header cells. + + Returns: + CSS style string + """ + return ( + "border: 1px solid black; padding: 8px; text-align: left; " + "background-color: #f2f2f2; white-space: nowrap; min-width: fit-content; " + "max-width: fit-content;" + ) + + +class DataFrameHtmlFormatter: + """Configurable HTML formatter for DataFusion DataFrames. + + This class handles the HTML rendering of DataFrames for display in + Jupyter notebooks and other rich display contexts. + + This class supports extension through composition. Key extension points: + - Provide a custom StyleProvider for styling cells and headers + - Register custom formatters for specific types + - Provide custom cell builders for specialized cell rendering + + Args: + max_cell_length: Maximum characters to display in a cell before truncation + max_width: Maximum width of the HTML table in pixels + max_height: Maximum height of the HTML table in pixels + enable_cell_expansion: Whether to add expand/collapse buttons for long cell + values + custom_css: Additional CSS to include in the HTML output + show_truncation_message: Whether to display a message when data is truncated + style_provider: Custom provider for cell and header styles + use_shared_styles: Whether to load styles and scripts only once per notebook + session + """ + + # Class variable to track if styles have been loaded in the notebook + _styles_loaded = False + + def __init__( + self, + max_cell_length: int = 25, + max_width: int = 1000, + max_height: int = 300, + enable_cell_expansion: bool = True, + custom_css: Optional[str] = None, + show_truncation_message: bool = True, + style_provider: Optional[StyleProvider] = None, + use_shared_styles: bool = True, + ) -> None: + """Initialize the HTML formatter. + + Parameters + ---------- + max_cell_length : int, default 25 + Maximum length of cell content before truncation. + max_width : int, default 1000 + Maximum width of the displayed table in pixels. + max_height : int, default 300 + Maximum height of the displayed table in pixels. + enable_cell_expansion : bool, default True + Whether to allow cells to expand when clicked. + custom_css : str, optional + Custom CSS to apply to the HTML table. + show_truncation_message : bool, default True + Whether to show a message indicating that content has been truncated. + style_provider : StyleProvider, optional + Provider of CSS styles for the HTML table. If None, DefaultStyleProvider + is used. + use_shared_styles : bool, default True + Whether to use shared styles across multiple tables. + + Raises: + ------ + ValueError + If max_cell_length, max_width, or max_height is not a positive integer. + TypeError + If enable_cell_expansion, show_truncation_message, or use_shared_styles is + not a boolean, + or if custom_css is provided but is not a string, + or if style_provider is provided but does not implement the StyleProvider + protocol. + """ + # Validate numeric parameters + + if not isinstance(max_cell_length, int) or max_cell_length <= 0: + msg = "max_cell_length must be a positive integer" + raise ValueError(msg) + if not isinstance(max_width, int) or max_width <= 0: + msg = "max_width must be a positive integer" + raise ValueError(msg) + if not isinstance(max_height, int) or max_height <= 0: + msg = "max_height must be a positive integer" + raise ValueError(msg) + + # Validate boolean parameters + if not isinstance(enable_cell_expansion, bool): + msg = "enable_cell_expansion must be a boolean" + raise TypeError(msg) + if not isinstance(show_truncation_message, bool): + msg = "show_truncation_message must be a boolean" + raise TypeError(msg) + if not isinstance(use_shared_styles, bool): + msg = "use_shared_styles must be a boolean" + raise TypeError(msg) + + # Validate custom_css + if custom_css is not None and not isinstance(custom_css, str): + msg = "custom_css must be None or a string" + raise TypeError(msg) + + # Validate style_provider + if style_provider is not None and not isinstance(style_provider, StyleProvider): + msg = "style_provider must implement the StyleProvider protocol" + raise TypeError(msg) + + self.max_cell_length = max_cell_length + self.max_width = max_width + self.max_height = max_height + self.enable_cell_expansion = enable_cell_expansion + self.custom_css = custom_css + self.show_truncation_message = show_truncation_message + self.style_provider = style_provider or DefaultStyleProvider() + self.use_shared_styles = use_shared_styles + # Registry for custom type formatters + self._type_formatters: dict[type, CellFormatter] = {} + # Custom cell builders + self._custom_cell_builder: Optional[Callable[[Any, int, int, str], str]] = None + self._custom_header_builder: Optional[Callable[[Any], str]] = None + + def register_formatter(self, type_class: type, formatter: CellFormatter) -> None: + """Register a custom formatter for a specific data type. + + Args: + type_class: The type to register a formatter for + formatter: Function that takes a value of the given type and returns + a formatted string + """ + self._type_formatters[type_class] = formatter + + def set_custom_cell_builder( + self, builder: Callable[[Any, int, int, str], str] + ) -> None: + """Set a custom cell builder function. + + Args: + builder: Function that takes (value, row, col, table_id) and returns HTML + """ + self._custom_cell_builder = builder + + def set_custom_header_builder(self, builder: Callable[[Any], str]) -> None: + """Set a custom header builder function. + + Args: + builder: Function that takes a field and returns HTML + """ + self._custom_header_builder = builder + + @classmethod + def is_styles_loaded(cls) -> bool: + """Check if HTML styles have been loaded in the current session. + + This method is primarily intended for debugging UI rendering issues + related to style loading. + + Returns: + True if styles have been loaded, False otherwise + + Example: + >>> from datafusion.html_formatter import DataFrameHtmlFormatter + >>> DataFrameHtmlFormatter.is_styles_loaded() + False + """ + return cls._styles_loaded + + def format_html( + self, + batches: list, + schema: Any, + has_more: bool = False, + table_uuid: str | None = None, + ) -> str: + """Format record batches as HTML. + + This method is used by DataFrame's _repr_html_ implementation and can be + called directly when custom HTML rendering is needed. + + Args: + batches: List of Arrow RecordBatch objects + schema: Arrow Schema object + has_more: Whether there are more batches not shown + table_uuid: Unique ID for the table, used for JavaScript interactions + + Returns: + HTML string representation of the data + + Raises: + TypeError: If schema is invalid and no batches are provided + """ + if not batches: + return "No data to display" + + # Validate schema + if schema is None or not hasattr(schema, "__iter__"): + msg = "Schema must be provided" + raise TypeError(msg) + + # Generate a unique ID if none provided + table_uuid = table_uuid or f"df-{id(batches)}" + + # Build HTML components + html = [] + + # Only include styles and scripts if: + # 1. Not using shared styles, OR + # 2. Using shared styles but they haven't been loaded yet + include_styles = ( + not self.use_shared_styles or not DataFrameHtmlFormatter._styles_loaded + ) + + if include_styles: + html.extend(self._build_html_header()) + # If we're using shared styles, mark them as loaded + if self.use_shared_styles: + DataFrameHtmlFormatter._styles_loaded = True + + html.extend(self._build_table_container_start()) + + # Add table header and body + html.extend(self._build_table_header(schema)) + html.extend(self._build_table_body(batches, table_uuid)) + + html.append("") + html.append("") + + # Add footer (JavaScript and messages) + if include_styles and self.enable_cell_expansion: + html.append(self._get_javascript()) + + # Always add truncation message if needed (independent of styles) + if has_more and self.show_truncation_message: + html.append("
Data truncated due to size.
") + + return "\n".join(html) + + def _build_html_header(self) -> list[str]: + """Build the HTML header with CSS styles.""" + html = [] + html.append("") + return html + + def _build_table_container_start(self) -> list[str]: + """Build the opening tags for the table container.""" + html = [] + html.append( + f'
' + ) + html.append('') + return html + + def _build_table_header(self, schema: Any) -> list[str]: + """Build the HTML table header with column names.""" + html = [] + html.append("") + html.append("") + for field in schema: + if self._custom_header_builder: + html.append(self._custom_header_builder(field)) + else: + html.append( + f"" + ) + html.append("") + html.append("") + return html + + def _build_table_body(self, batches: list, table_uuid: str) -> list[str]: + """Build the HTML table body with data rows.""" + html = [] + html.append("") + + row_count = 0 + for batch in batches: + for row_idx in range(batch.num_rows): + row_count += 1 + html.append("") + + for col_idx, column in enumerate(batch.columns): + # Get the raw value from the column + raw_value = self._get_cell_value(column, row_idx) + + # Always check for type formatters first to format the value + formatted_value = self._format_cell_value(raw_value) + + # Then apply either custom cell builder or standard cell formatting + if self._custom_cell_builder: + # Pass both the raw value and formatted value to let the + # builder decide + cell_html = self._custom_cell_builder( + raw_value, row_count, col_idx, table_uuid + ) + html.append(cell_html) + else: + # Standard cell formatting with formatted value + if ( + len(str(raw_value)) > self.max_cell_length + and self.enable_cell_expansion + ): + cell_html = self._build_expandable_cell( + formatted_value, row_count, col_idx, table_uuid + ) + else: + cell_html = self._build_regular_cell(formatted_value) + html.append(cell_html) + + html.append("") + + html.append("") + return html + + def _get_cell_value(self, column: Any, row_idx: int) -> Any: + """Extract a cell value from a column. + + Args: + column: Arrow array + row_idx: Row index + + Returns: + The raw cell value + """ + try: + value = column[row_idx] + + if hasattr(value, "as_py"): + return value.as_py() + except (AttributeError, TypeError): + pass + else: + return value + + def _format_cell_value(self, value: Any) -> str: + """Format a cell value for display. + + Uses registered type formatters if available. + + Args: + value: The cell value to format + + Returns: + Formatted cell value as string + """ + # Check for custom type formatters + for type_cls, formatter in self._type_formatters.items(): + if isinstance(value, type_cls): + return formatter(value) + + # If no formatter matched, return string representation + return str(value) + + def _build_expandable_cell( + self, formatted_value: str, row_count: int, col_idx: int, table_uuid: str + ) -> str: + """Build an expandable cell for long content.""" + short_value = str(formatted_value)[: self.max_cell_length] + return ( + f"" + ) + + def _build_regular_cell(self, formatted_value: str) -> str: + """Build a regular table cell.""" + return ( + f"" + ) + + def _build_html_footer(self, has_more: bool) -> list[str]: + """Build the HTML footer with JavaScript and messages.""" + html = [] + + # Add JavaScript for interactivity only if cell expansion is enabled + # and we're not using the shared styles approach + if self.enable_cell_expansion and not self.use_shared_styles: + html.append(self._get_javascript()) + + # Add truncation message if needed + if has_more and self.show_truncation_message: + html.append("
Data truncated due to size.
") + + return html + + def _get_default_css(self) -> str: + """Get default CSS styles for the HTML table.""" + return """ + .expandable-container { + display: inline-block; + max-width: 200px; + } + .expandable { + white-space: nowrap; + overflow: hidden; + text-overflow: ellipsis; + display: block; + } + .full-text { + display: none; + white-space: normal; + } + .expand-btn { + cursor: pointer; + color: blue; + text-decoration: underline; + border: none; + background: none; + font-size: inherit; + display: block; + margin-top: 5px; + } + """ + + def _get_javascript(self) -> str: + """Get JavaScript code for interactive elements.""" + return """ + + """ + + +class FormatterManager: + """Manager class for the global DataFrame HTML formatter instance.""" + + _default_formatter: DataFrameHtmlFormatter = DataFrameHtmlFormatter() + + @classmethod + def set_formatter(cls, formatter: DataFrameHtmlFormatter) -> None: + """Set the global DataFrame HTML formatter. + + Args: + formatter: The formatter instance to use globally + """ + cls._default_formatter = formatter + _refresh_formatter_reference() + + @classmethod + def get_formatter(cls) -> DataFrameHtmlFormatter: + """Get the current global DataFrame HTML formatter. + + Returns: + The global HTML formatter instance + """ + return cls._default_formatter + + +def get_formatter() -> DataFrameHtmlFormatter: + """Get the current global DataFrame HTML formatter. + + This function is used by the DataFrame._repr_html_ implementation to access + the shared formatter instance. It can also be used directly when custom + HTML rendering is needed. + + Returns: + The global HTML formatter instance + + Example: + >>> from datafusion.html_formatter import get_formatter + >>> formatter = get_formatter() + >>> formatter.max_cell_length = 50 # Increase cell length + """ + return FormatterManager.get_formatter() + + +def set_formatter(formatter: DataFrameHtmlFormatter) -> None: + """Set the global DataFrame HTML formatter. + + Args: + formatter: The formatter instance to use globally + + Example: + >>> from datafusion.html_formatter import get_formatter, set_formatter + >>> custom_formatter = DataFrameHtmlFormatter(max_cell_length=100) + >>> set_formatter(custom_formatter) + """ + FormatterManager.set_formatter(formatter) + + +def configure_formatter(**kwargs: Any) -> None: + """Configure the global DataFrame HTML formatter. + + This function creates a new formatter with the provided configuration + and sets it as the global formatter for all DataFrames. + + Args: + **kwargs: Formatter configuration parameters like max_cell_length, + max_width, max_height, enable_cell_expansion, etc. + + Example: + >>> from datafusion.html_formatter import configure_formatter + >>> configure_formatter( + ... max_cell_length=50, + ... max_height=500, + ... enable_cell_expansion=True, + ... use_shared_styles=True + ... ) + """ + set_formatter(DataFrameHtmlFormatter(**kwargs)) + + +def reset_formatter() -> None: + """Reset the global DataFrame HTML formatter to default settings. + + This function creates a new formatter with default configuration + and sets it as the global formatter for all DataFrames. + + Example: + >>> from datafusion.html_formatter import reset_formatter + >>> reset_formatter() # Reset formatter to default settings + """ + formatter = DataFrameHtmlFormatter() + # Reset the styles_loaded flag to ensure styles will be reloaded + DataFrameHtmlFormatter._styles_loaded = False + set_formatter(formatter) + + +def reset_styles_loaded_state() -> None: + """Reset the styles loaded state to force reloading of styles. + + This can be useful when switching between notebook sessions or + when styles need to be refreshed. + + Example: + >>> from datafusion.html_formatter import reset_styles_loaded_state + >>> reset_styles_loaded_state() # Force styles to reload in next render + """ + DataFrameHtmlFormatter._styles_loaded = False + + +def _refresh_formatter_reference() -> None: + """Refresh formatter reference in any modules using it. + + This helps ensure that changes to the formatter are reflected in existing + DataFrames that might be caching the formatter reference. + """ + # This is a no-op but signals modules to refresh their reference diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index eda13930d..464b884db 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -28,8 +28,17 @@ column, literal, ) -from datafusion import functions as f +from datafusion import ( + functions as f, +) from datafusion.expr import Window +from datafusion.html_formatter import ( + DataFrameHtmlFormatter, + configure_formatter, + get_formatter, + reset_formatter, + reset_styles_loaded_state, +) from pyarrow.csv import write_csv @@ -102,6 +111,12 @@ def partitioned_df(): return ctx.create_dataframe([[batch]]) +@pytest.fixture +def clean_formatter_state(): + """Reset the HTML formatter after each test.""" + reset_formatter() + + def test_select(df): df_1 = df.select( column("a") + column("b"), @@ -656,6 +671,252 @@ def test_window_frame_defaults_match_postgres(partitioned_df): assert df_2.sort(col_a).to_pydict() == expected +def test_html_formatter_configuration(df, clean_formatter_state): + """Test configuring the HTML formatter with different options.""" + # Configure with custom settings + configure_formatter( + max_cell_length=5, + max_width=500, + max_height=200, + enable_cell_expansion=False, + ) + + html_output = df._repr_html_() + + # Verify our configuration was applied + assert "max-height: 200px" in html_output + assert "max-width: 500px" in html_output + # With cell expansion disabled, we shouldn't see expandable-container elements + assert "expandable-container" not in html_output + + +def test_html_formatter_custom_style_provider(df, clean_formatter_state): + """Test using custom style providers with the HTML formatter.""" + + class CustomStyleProvider: + def get_cell_style(self) -> str: + return ( + "background-color: #f5f5f5; color: #333; padding: 8px; border: " + "1px solid #ddd;" + ) + + def get_header_style(self) -> str: + return ( + "background-color: #4285f4; color: white; font-weight: bold; " + "padding: 10px; border: 1px solid #3367d6;" + ) + + # Configure with custom style provider + configure_formatter(style_provider=CustomStyleProvider()) + + html_output = df._repr_html_() + + # Verify our custom styles were applied + assert "background-color: #4285f4" in html_output + assert "color: white" in html_output + assert "background-color: #f5f5f5" in html_output + + +def test_html_formatter_type_formatters(df, clean_formatter_state): + """Test registering custom type formatters for specific data types.""" + + # Get current formatter and register custom formatters + formatter = get_formatter() + + # Format integers with color based on value + # Using int as the type for the formatter will work since we convert + # Arrow scalar values to Python native types in _get_cell_value + def format_int(value): + return f' 2 else "blue"}">{value}' + + formatter.register_formatter(int, format_int) + + html_output = df._repr_html_() + + # Our test dataframe has values 1,2,3 so we should see: + assert '1' in html_output + + +def test_html_formatter_custom_cell_builder(df, clean_formatter_state): + """Test using a custom cell builder function.""" + + # Create a custom cell builder with distinct styling for different value ranges + def custom_cell_builder(value, row, col, table_id): + try: + num_value = int(value) + if num_value > 5: # Values > 5 get green background with indicator + return ( + '' + ) + if num_value < 3: # Values < 3 get blue background with indicator + return ( + '' + ) + except (ValueError, TypeError): + pass + + # Default styling for other cells (3, 4, 5) + return f'' + + # Set our custom cell builder + formatter = get_formatter() + formatter.set_custom_cell_builder(custom_cell_builder) + + html_output = df._repr_html_() + + # Extract cells with specific styling using regex + low_cells = re.findall( + r'', html_output + ) + mid_cells = re.findall( + r'', html_output + ) + high_cells = re.findall( + r'', html_output + ) + + # Sort the extracted values for consistent comparison + low_cells = sorted(map(int, low_cells)) + mid_cells = sorted(map(int, mid_cells)) + high_cells = sorted(map(int, high_cells)) + + # Verify specific values have the correct styling applied + assert low_cells == [1, 2] # Values < 3 + assert mid_cells == [3, 4, 5, 5] # Values 3-5 + assert high_cells == [6, 8, 8] # Values > 5 + + # Verify the exact content with styling appears in the output + assert ( + '' + in html_output + ) + assert ( + '' + in html_output + ) + assert ( + '' in html_output + ) + assert ( + '' in html_output + ) + assert ( + '' + in html_output + ) + assert ( + '' + in html_output + ) + + # Count occurrences to ensure all cells are properly styled + assert html_output.count("-low") == 2 # Two low values (1, 2) + assert html_output.count("-mid") == 4 # Four mid values (3, 4, 5, 5) + assert html_output.count("-high") == 3 # Three high values (6, 8, 8) + + # Create a custom cell builder that changes background color based on value + def custom_cell_builder(value, row, col, table_id): + # Handle numeric values regardless of their exact type + try: + num_value = int(value) + if num_value > 5: # Values > 5 get green background + return f'' + if num_value < 3: # Values < 3 get light blue background + return f'' + except (ValueError, TypeError): + pass + + # Default styling for other cells + return f'' + + # Set our custom cell builder + formatter = get_formatter() + formatter.set_custom_cell_builder(custom_cell_builder) + + html_output = df._repr_html_() + + # Verify our custom cell styling was applied + assert "background-color: #d3e9f0" in html_output # For values 1,2 + + +def test_html_formatter_custom_header_builder(df, clean_formatter_state): + """Test using a custom header builder function.""" + + # Create a custom header builder with tooltips + def custom_header_builder(field): + tooltips = { + "a": "Primary key column", + "b": "Secondary values", + "c": "Additional data", + } + tooltip = tooltips.get(field.name, "") + return ( + f'' + ) + + # Set our custom header builder + formatter = get_formatter() + formatter.set_custom_header_builder(custom_header_builder) + + html_output = df._repr_html_() + + # Verify our custom headers were applied + assert 'title="Primary key column"' in html_output + assert 'title="Secondary values"' in html_output + assert "background-color: #333; color: white" in html_output + + +def test_html_formatter_complex_customization(df, clean_formatter_state): + """Test combining multiple customization options together.""" + + # Create a dark mode style provider + class DarkModeStyleProvider: + def get_cell_style(self) -> str: + return ( + "background-color: #222; color: #eee; " + "padding: 8px; border: 1px solid #444;" + ) + + def get_header_style(self) -> str: + return ( + "background-color: #111; color: #fff; padding: 10px; " + "border: 1px solid #333;" + ) + + # Configure with dark mode style + configure_formatter( + max_cell_length=10, + style_provider=DarkModeStyleProvider(), + custom_css=""" + .datafusion-table { + font-family: monospace; + border-collapse: collapse; + } + .datafusion-table tr:hover td { + background-color: #444 !important; + } + """, + ) + + # Add type formatters for special formatting - now working with native int values + formatter = get_formatter() + formatter.register_formatter( + int, + lambda n: f'{n}', + ) + + html_output = df._repr_html_() + + # Verify our customizations were applied + assert "background-color: #222" in html_output + assert "background-color: #111" in html_output + assert ".datafusion-table" in html_output + assert "color: #5af" in html_output # Even numbers + + def test_get_dataframe(tmp_path): ctx = SessionContext() @@ -1244,7 +1505,10 @@ def add_with_parameter(df_internal, value: Any) -> DataFrame: assert result["new_col"] == [3 for _i in range(3)] -def test_dataframe_repr_html(df) -> None: +def test_dataframe_repr_html_structure(df) -> None: + """Test that DataFrame._repr_html_ produces expected HTML output structure.""" + import re + output = df._repr_html_() # Since we've added a fair bit of processing to the html output, lets just verify @@ -1255,9 +1519,131 @@ def test_dataframe_repr_html(df) -> None: headers = ["a", "b", "c"] headers = [f"{v}" for v in headers] header_pattern = "(.*?)".join(headers) - assert len(re.findall(header_pattern, output, re.DOTALL)) == 1 + header_matches = re.findall(header_pattern, output, re.DOTALL) + assert len(header_matches) == 1 + # Update the pattern to handle values that may be wrapped in spans body_data = [[1, 4, 8], [2, 5, 5], [3, 6, 8]] - body_lines = [f"{v}" for inner in body_data for v in inner] + + body_lines = [ + f"(?:]*?>)?{v}(?:)?" + for inner in body_data + for v in inner + ] body_pattern = "(.*?)".join(body_lines) - assert len(re.findall(body_pattern, output, re.DOTALL)) == 1 + + body_matches = re.findall(body_pattern, output, re.DOTALL) + + assert len(body_matches) == 1, "Expected pattern of values not found in HTML output" + + +def test_dataframe_repr_html_values(df): + """Test that DataFrame._repr_html_ contains the expected data values.""" + html = df._repr_html_() + assert html is not None + + # Create a more flexible pattern that handles values being wrapped in spans + # This pattern will match the sequence of values 1,4,8,2,5,5,3,6,8 regardless + # of formatting + pattern = re.compile( + r"]*?>(?:]*?>)?1(?:)?.*?" + r"]*?>(?:]*?>)?4(?:)?.*?" + r"]*?>(?:]*?>)?8(?:)?.*?" + r"]*?>(?:]*?>)?2(?:)?.*?" + r"]*?>(?:]*?>)?5(?:)?.*?" + r"]*?>(?:]*?>)?5(?:)?.*?" + r"]*?>(?:]*?>)?3(?:)?.*?" + r"]*?>(?:]*?>)?6(?:)?.*?" + r"]*?>(?:]*?>)?8(?:)?", + re.DOTALL, + ) + + # Print debug info if the test fails + matches = re.findall(pattern, html) + if not matches: + print(f"HTML output snippet: {html[:500]}...") # noqa: T201 + + assert len(matches) > 0, "Expected pattern of values not found in HTML output" + + +def test_html_formatter_shared_styles(df, clean_formatter_state): + """Test that shared styles work correctly across multiple tables.""" + + # First, ensure we're using shared styles + configure_formatter(use_shared_styles=True) + + # Get HTML output for first table - should include styles + html_first = df._repr_html_() + + # Verify styles are included in first render + assert " + // Convert record batches to PyObject list + let py_batches = batches + .into_iter() + .map(|rb| rb.to_pyarrow(py)) + .collect::>>()?; -
-
" + f"{field.name}
" + f"
" + "" + "" + f"{formatted_value}" + f"" + f"
" + f"
{formatted_value}{value}-high{value}-low{value}-mid]*>(\d+)-low]*>(\d+)-mid]*>(\d+)-high1-low2-low3-mid4-mid6-high8-high{value}{value}{value}{field.name}
- \n".to_string(); + let py_schema = self.schema().into_pyobject(py)?; - let schema = batches[0].schema(); + // Get the Python formatter module and call format_html + let formatter_module = py.import("datafusion.html_formatter")?; + let get_formatter = formatter_module.getattr("get_formatter")?; + let formatter = get_formatter.call0()?; - let mut header = Vec::new(); - for field in schema.fields() { - header.push(format!("", field.name())); - } - let header_str = header.join(""); - html_str.push_str(&format!("{}\n", header_str)); - - let batch_formatters = batches - .iter() - .map(|batch| { - batch - .columns() - .iter() - .map(|c| ArrayFormatter::try_new(c.as_ref(), &FormatOptions::default())) - .map(|c| { - c.map_err(|e| PyValueError::new_err(format!("Error: {:?}", e.to_string()))) - }) - .collect::, _>>() - }) - .collect::, _>>()?; - - let rows_per_batch = batches.iter().map(|batch| batch.num_rows()); - - // We need to build up row by row for html - let mut table_row = 0; - for (batch_formatter, num_rows_in_batch) in batch_formatters.iter().zip(rows_per_batch) { - for batch_row in 0..num_rows_in_batch { - table_row += 1; - let mut cells = Vec::new(); - for (col, formatter) in batch_formatter.iter().enumerate() { - let cell_data = formatter.value(batch_row).to_string(); - // From testing, primitive data types do not typically get larger than 21 characters - if cell_data.len() > MAX_LENGTH_CELL_WITHOUT_MINIMIZE { - let short_cell_data = &cell_data[0..MAX_LENGTH_CELL_WITHOUT_MINIMIZE]; - cells.push(format!(" - ")); - } else { - cells.push(format!("", formatter.value(batch_row))); - } - } - let row_str = cells.join(""); - html_str.push_str(&format!("{}\n", row_str)); - } - } - html_str.push_str("
{}
-
- {short_cell_data} - {cell_data} - -
-
{}
\n"); - - html_str.push_str(" - - "); + // Call format_html method on the formatter + let kwargs = pyo3::types::PyDict::new(py); + let py_batches_list = PyList::new(py, py_batches.as_slice())?; + kwargs.set_item("batches", py_batches_list)?; + kwargs.set_item("schema", py_schema)?; + kwargs.set_item("has_more", has_more)?; + kwargs.set_item("table_uuid", table_uuid)?; - if has_more { - html_str.push_str("Data truncated due to size."); - } + let html_result = formatter.call_method("format_html", (), Some(&kwargs))?; + let html_str: String = html_result.extract()?; Ok(html_str) } @@ -835,7 +747,7 @@ fn record_batch_into_schema( ) -> Result { let schema = Arc::new(schema.clone()); let base_schema = record_batch.schema(); - if base_schema.fields().len() == 0 { + if base_schema.fields().is_empty() { // Nothing to project return Ok(RecordBatch::new_empty(schema)); }