diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py
index ecf5545bc..60d0d61b4 100644
--- a/python/datafusion/__init__.py
+++ b/python/datafusion/__init__.py
@@ -45,6 +45,7 @@
Expr,
WindowFrame,
)
+from .html_formatter import configure_formatter
from .io import read_avro, read_csv, read_json, read_parquet
from .plan import ExecutionPlan, LogicalPlan
from .record_batch import RecordBatch, RecordBatchStream
@@ -76,6 +77,7 @@
"col",
"column",
"common",
+ "configure_formatter",
"expr",
"functions",
"lit",
diff --git a/python/datafusion/html_formatter.py b/python/datafusion/html_formatter.py
new file mode 100644
index 000000000..a50e14fd5
--- /dev/null
+++ b/python/datafusion/html_formatter.py
@@ -0,0 +1,647 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""HTML formatting utilities for DataFusion DataFrames."""
+
+from __future__ import annotations
+
+from typing import (
+ Any,
+ Callable,
+ Optional,
+ Protocol,
+ runtime_checkable,
+)
+
+
+@runtime_checkable
+class CellFormatter(Protocol):
+ """Protocol for cell value formatters.
+
+ A formatter is any callable that takes a single cell value and returns the
+ string to display for it.
+ """
+
+ def __call__(self, value: Any) -> str:
+ """Format a cell value to string representation.
+
+ Args:
+ value: The cell value to format
+
+ Returns:
+ The string representation of the value
+ """
+ ...
+
+
+@runtime_checkable
+class StyleProvider(Protocol):
+ """Protocol for HTML style providers.
+
+ Because the protocol is ``runtime_checkable``, an ``isinstance`` check against
+ it only verifies that both methods exist; it does not check their signatures
+ or return types.
+ """
+
+ def get_cell_style(self) -> str:
+ """Get the CSS style for table cells.
+
+ Returns:
+ CSS style string
+ """
+ ...
+
+ def get_header_style(self) -> str:
+ """Get the CSS style for header cells.
+
+ Returns:
+ CSS style string
+ """
+ ...
+
+
+class DefaultStyleProvider:
+ """Default implementation of StyleProvider.
+
+ Both methods return fixed CSS declaration strings; the class holds no state.
+ """
+
+ def get_cell_style(self) -> str:
+ """Get the CSS style for table cells.
+
+ Returns:
+ CSS style string (black 1px border, 8px padding, left-aligned, no
+ wrapping)
+ """
+ return (
+ "border: 1px solid black; padding: 8px; text-align: left; "
+ "white-space: nowrap;"
+ )
+
+ def get_header_style(self) -> str:
+ """Get the CSS style for header cells.
+
+ Returns:
+ CSS style string (the cell style plus a light grey background and
+ fit-content width limits)
+ """
+ return (
+ "border: 1px solid black; padding: 8px; text-align: left; "
+ "background-color: #f2f2f2; white-space: nowrap; min-width: fit-content; "
+ "max-width: fit-content;"
+ )
+
+
+class DataFrameHtmlFormatter:
+ """Configurable HTML formatter for DataFusion DataFrames.
+
+ This class handles the HTML rendering of DataFrames for display in
+ Jupyter notebooks and other rich display contexts.
+
+ This class supports extension through composition. Key extension points:
+ - Provide a custom StyleProvider for styling cells and headers
+ - Register custom formatters for specific types (register_formatter)
+ - Provide custom cell builders for specialized cell rendering
+ (set_custom_cell_builder / set_custom_header_builder)
+
+ Args:
+ max_cell_length: Maximum characters to display in a cell before truncation
+ max_width: Maximum width of the HTML table in pixels
+ max_height: Maximum height of the HTML table in pixels
+ enable_cell_expansion: Whether to add expand/collapse buttons for long cell
+ values
+ custom_css: Additional CSS to include in the HTML output
+ show_truncation_message: Whether to display a message when data is truncated
+ style_provider: Custom provider for cell and header styles
+ use_shared_styles: Whether to load styles and scripts only once per notebook
+ session
+ """
+
+ # Class-level flag, shared by every formatter instance, recording whether the
+ # shared styles have already been emitted. format_html sets it to True; the
+ # module-level reset_formatter and reset_styles_loaded_state set it to False.
+ _styles_loaded = False
+
+ def __init__(
+ self,
+ max_cell_length: int = 25,
+ max_width: int = 1000,
+ max_height: int = 300,
+ enable_cell_expansion: bool = True,
+ custom_css: Optional[str] = None,
+ show_truncation_message: bool = True,
+ style_provider: Optional[StyleProvider] = None,
+ use_shared_styles: bool = True,
+ ) -> None:
+ """Initialize the HTML formatter.
+
+ All arguments are validated before any attribute is assigned, so a failed
+ construction never leaves a partially configured instance behind.
+
+ Parameters
+ ----------
+ max_cell_length : int, default 25
+ Maximum length of cell content before truncation.
+ max_width : int, default 1000
+ Maximum width of the displayed table in pixels.
+ max_height : int, default 300
+ Maximum height of the displayed table in pixels.
+ enable_cell_expansion : bool, default True
+ Whether to allow cells to expand when clicked.
+ custom_css : str, optional
+ Custom CSS to apply to the HTML table.
+ show_truncation_message : bool, default True
+ Whether to show a message indicating that content has been truncated.
+ style_provider : StyleProvider, optional
+ Provider of CSS styles for the HTML table. If None, DefaultStyleProvider
+ is used.
+ use_shared_styles : bool, default True
+ Whether to use shared styles across multiple tables.
+
+ Raises
+ ------
+ ValueError
+ If max_cell_length, max_width, or max_height is not a positive integer
+ (this includes values that are not of type int at all).
+ TypeError
+ If enable_cell_expansion, show_truncation_message, or use_shared_styles is
+ not a boolean,
+ or if custom_css is provided but is not a string,
+ or if style_provider is provided but does not implement the StyleProvider
+ protocol.
+ """
+ # Validate numeric parameters
+ # NOTE(review): bool is a subclass of int, so True passes these checks and
+ # is accepted as a size of 1 - confirm whether that is intended.
+
+ if not isinstance(max_cell_length, int) or max_cell_length <= 0:
+ msg = "max_cell_length must be a positive integer"
+ raise ValueError(msg)
+ if not isinstance(max_width, int) or max_width <= 0:
+ msg = "max_width must be a positive integer"
+ raise ValueError(msg)
+ if not isinstance(max_height, int) or max_height <= 0:
+ msg = "max_height must be a positive integer"
+ raise ValueError(msg)
+
+ # Validate boolean parameters
+ if not isinstance(enable_cell_expansion, bool):
+ msg = "enable_cell_expansion must be a boolean"
+ raise TypeError(msg)
+ if not isinstance(show_truncation_message, bool):
+ msg = "show_truncation_message must be a boolean"
+ raise TypeError(msg)
+ if not isinstance(use_shared_styles, bool):
+ msg = "use_shared_styles must be a boolean"
+ raise TypeError(msg)
+
+ # Validate custom_css
+ if custom_css is not None and not isinstance(custom_css, str):
+ msg = "custom_css must be None or a string"
+ raise TypeError(msg)
+
+ # Validate style_provider
+ # StyleProvider is a runtime-checkable Protocol, so this isinstance check is
+ # structural: it only confirms that the required methods are present.
+ if style_provider is not None and not isinstance(style_provider, StyleProvider):
+ msg = "style_provider must implement the StyleProvider protocol"
+ raise TypeError(msg)
+
+ self.max_cell_length = max_cell_length
+ self.max_width = max_width
+ self.max_height = max_height
+ self.enable_cell_expansion = enable_cell_expansion
+ self.custom_css = custom_css
+ self.show_truncation_message = show_truncation_message
+ # Fall back to the default provider when none (or a falsy one) is supplied
+ self.style_provider = style_provider or DefaultStyleProvider()
+ self.use_shared_styles = use_shared_styles
+ # Registry for custom type formatters
+ self._type_formatters: dict[type, CellFormatter] = {}
+ # Custom cell builders
+ # Cell builder signature: (value, row, col, table_id) -> HTML string
+ self._custom_cell_builder: Optional[Callable[[Any, int, int, str], str]] = None
+ # Header builder signature: (field) -> HTML string
+ self._custom_header_builder: Optional[Callable[[Any], str]] = None
+
+ def register_formatter(self, type_class: type, formatter: CellFormatter) -> None:
+ """Register a custom formatter for a specific data type.
+
+ Registering the same type again replaces the previous formatter. The
+ registry lives on this instance, so it is not carried over to a newly
+ created formatter.
+
+ Args:
+ type_class: The type to register a formatter for
+ formatter: Function that takes a value of the given type and returns
+ a formatted string
+ """
+ self._type_formatters[type_class] = formatter
+
+ def set_custom_cell_builder(
+ self, builder: Callable[[Any, int, int, str], str]
+ ) -> None:
+ """Set a custom cell builder function.
+
+ Once set, the builder is used for every cell in place of the standard
+ regular/expandable cell rendering. Any previously set builder is replaced.
+
+ Args:
+ builder: Function that takes (value, row, col, table_id) and returns HTML
+ """
+ self._custom_cell_builder = builder
+
+ def set_custom_header_builder(self, builder: Callable[[Any], str]) -> None:
+ """Set a custom header builder function.
+
+ Once set, the builder is called once per schema field and its return value
+ is used instead of the default header cell. Any previously set builder is
+ replaced.
+
+ Args:
+ builder: Function that takes a field and returns HTML
+ """
+ self._custom_header_builder = builder
+
+ @classmethod
+ def is_styles_loaded(cls) -> bool:
+ """Check if HTML styles have been loaded in the current session.
+
+ This method is primarily intended for debugging UI rendering issues
+ related to style loading. It reports the class-level ``_styles_loaded``
+ flag, which is shared by all formatter instances.
+
+ Returns:
+ True if styles have been loaded, False otherwise
+
+ Example:
+ >>> from datafusion.html_formatter import DataFrameHtmlFormatter
+ >>> DataFrameHtmlFormatter.is_styles_loaded()
+ False
+ """
+ return cls._styles_loaded
+
+ def format_html(
+ self,
+ batches: list,
+ schema: Any,
+ has_more: bool = False,
+ table_uuid: str | None = None,
+ ) -> str:
+ """Format record batches as HTML.
+
+ This method is used by DataFrame's _repr_html_ implementation and can be
+ called directly when custom HTML rendering is needed.
+
+ Args:
+ batches: List of Arrow RecordBatch objects
+ schema: Arrow Schema object
+ has_more: Whether there are more batches not shown
+ table_uuid: Unique ID for the table, used for JavaScript interactions
+
+ Returns:
+ HTML string representation of the data, or the plain text
+ "No data to display" when ``batches`` is empty
+
+ Raises:
+ TypeError: If batches are provided but schema is None or not iterable
+ """
+ # The empty case returns before the schema is inspected, so an invalid
+ # schema is only reported when there is data to render.
+ if not batches:
+ return "No data to display"
+
+ # Validate schema
+ if schema is None or not hasattr(schema, "__iter__"):
+ msg = "Schema must be provided"
+ raise TypeError(msg)
+
+ # Generate a unique ID if none provided
+ table_uuid = table_uuid or f"df-{id(batches)}"
+
+ # Build HTML components
+ html = []
+
+ # Only include styles and scripts if:
+ # 1. Not using shared styles, OR
+ # 2. Using shared styles but they haven't been loaded yet
+ include_styles = (
+ not self.use_shared_styles or not DataFrameHtmlFormatter._styles_loaded
+ )
+
+ if include_styles:
+ html.extend(self._build_html_header())
+ # If we're using shared styles, mark them as loaded
+ if self.use_shared_styles:
+ DataFrameHtmlFormatter._styles_loaded = True
+
+ html.extend(self._build_table_container_start())
+
+ # Add table header and body
+ html.extend(self._build_table_header(schema))
+ html.extend(self._build_table_body(batches, table_uuid))
+
+ # NOTE(review): the literals below are empty or split across lines in this
+ # copy of the patch; the HTML markup they held appears to have been stripped
+ # - confirm against the upstream file.
+ html.append("")
+ html.append("")
+
+ # Add footer (JavaScript and messages)
+ # The script is emitted under the same condition as the styles, so with
+ # shared styles it is only included in the first rendered table.
+ if include_styles and self.enable_cell_expansion:
+ html.append(self._get_javascript())
+
+ # Always add truncation message if needed (independent of styles)
+ if has_more and self.show_truncation_message:
+ html.append("
Data truncated due to size.
")
+
+ return "\n".join(html)
+
+ def _build_html_header(self) -> list[str]:
+ """Build the HTML header with CSS styles.
+
+ Returns:
+ List of HTML fragments to place before the table container
+ """
+ html = []
+ # NOTE(review): this literal is empty in this copy of the patch; the style
+ # markup it held appears to have been stripped - confirm against upstream.
+ html.append("")
+ return html
+
+ def _build_table_container_start(self) -> list[str]:
+ """Build the opening tags for the table container.
+
+ Returns:
+ List of HTML fragments that open the container and the table
+ """
+ html = []
+ # NOTE(review): both literals below are split across lines with their
+ # markup missing in this copy of the patch - confirm against upstream.
+ html.append(
+ f'
'
+ )
+ html.append('
')
+ return html
+
+ def _build_table_header(self, schema: Any) -> list[str]:
+ """Build the HTML table header with column names.
+
+ Args:
+ schema: Iterable of fields; each field's ``name`` is used for the
+ default header cell
+
+ Returns:
+ List of HTML fragments for the header section
+ """
+ html = []
+ # NOTE(review): several literals in this method are empty or split across
+ # lines with their markup missing in this copy of the patch - confirm
+ # against upstream.
+ html.append("")
+ html.append("
")
+ for field in schema:
+ # A custom header builder, when set, fully replaces the default cell
+ if self._custom_header_builder:
+ html.append(self._custom_header_builder(field))
+ else:
+ # NOTE(review): field.name is interpolated as-is (no escaping is
+ # visible here) - confirm whether names can contain markup.
+ html.append(
+ f"
"
+ f"{field.name}
"
+ )
+ html.append("
")
+ html.append("")
+ return html
+
+ def _build_table_body(self, batches: list, table_uuid: str) -> list[str]:
+ """Build the HTML table body with data rows.
+
+ Args:
+ batches: Record batches to render; each must expose ``num_rows`` and
+ ``columns``
+ table_uuid: Table identifier forwarded to the cell builders
+
+ Returns:
+ List of HTML fragments for the body section
+ """
+ html = []
+ # NOTE(review): several literals in this method are empty or split across
+ # lines with their markup missing in this copy of the patch - confirm
+ # against upstream.
+ html.append("")
+
+ # Running row number across all batches; incremented before use, so the
+ # first row is numbered 1.
+ row_count = 0
+ for batch in batches:
+ for row_idx in range(batch.num_rows):
+ row_count += 1
+ html.append("
")
+
+ for col_idx, column in enumerate(batch.columns):
+ # Get the raw value from the column
+ raw_value = self._get_cell_value(column, row_idx)
+
+ # Always check for type formatters first to format the value
+ formatted_value = self._format_cell_value(raw_value)
+
+ # Then apply either custom cell builder or standard cell formatting
+ if self._custom_cell_builder:
+ # Only the raw value is passed to the builder; the formatted
+ # value computed above is not forwarded to it.
+ cell_html = self._custom_cell_builder(
+ raw_value, row_count, col_idx, table_uuid
+ )
+ html.append(cell_html)
+ else:
+ # Standard cell formatting with formatted value
+ # The length test uses str(raw_value), not the formatted
+ # value, so formatter output does not affect truncation.
+ if (
+ len(str(raw_value)) > self.max_cell_length
+ and self.enable_cell_expansion
+ ):
+ cell_html = self._build_expandable_cell(
+ formatted_value, row_count, col_idx, table_uuid
+ )
+ else:
+ cell_html = self._build_regular_cell(formatted_value)
+ html.append(cell_html)
+
+ html.append("
")
+
+ html.append("")
+ return html
+
+ def _get_cell_value(self, column: Any, row_idx: int) -> Any:
+ """Extract a cell value from a column.
+
+ Args:
+ column: Arrow array
+ row_idx: Row index
+
+ Returns:
+ The result of ``as_py()`` when the indexed element provides it,
+ otherwise the indexed element itself. None is returned when indexing
+ or conversion raises AttributeError or TypeError.
+ """
+ try:
+ value = column[row_idx]
+
+ if hasattr(value, "as_py"):
+ return value.as_py()
+ except (AttributeError, TypeError):
+ # Best effort: fall through and implicitly return None
+ pass
+ else:
+ # Reached only when no exception occurred and there was no as_py()
+ return value
+
+ def _format_cell_value(self, value: Any) -> str:
+ """Format a cell value for display.
+
+ Uses registered type formatters if available. Formatters are tried in
+ registration order with an isinstance test, and the first match wins, so
+ a formatter registered for a base type also handles its subclasses.
+
+ Args:
+ value: The cell value to format
+
+ Returns:
+ Formatted cell value as string
+ """
+ # Check for custom type formatters
+ for type_cls, formatter in self._type_formatters.items():
+ if isinstance(value, type_cls):
+ return formatter(value)
+
+ # If no formatter matched, return string representation
+ return str(value)
+
+ def _build_expandable_cell(
+ self, formatted_value: str, row_count: int, col_idx: int, table_uuid: str
+ ) -> str:
+ """Build an expandable cell for long content.
+
+ Args:
+ formatted_value: Cell text after type formatting
+ row_count: Running row number of the cell
+ col_idx: Column index of the cell
+ table_uuid: Identifier of the table being rendered
+
+ Returns:
+ HTML string for the cell
+ """
+ # Truncated preview: the first max_cell_length characters of the value
+ short_value = str(formatted_value)[: self.max_cell_length]
+ # NOTE(review): the f-string below is split across lines with its markup
+ # missing in this copy of the patch - confirm against upstream.
+ return (
+ f"
"
+ )
+
+ def _build_html_footer(self, has_more: bool) -> list[str]:
+ """Build the HTML footer with JavaScript and messages.
+
+ NOTE(review): no caller is visible in this patch; format_html assembles
+ its footer inline and uses a different condition for the script (whether
+ styles were included in that render) - confirm whether this helper is
+ still needed.
+
+ Args:
+ has_more: Whether there are more rows than were rendered
+
+ Returns:
+ List of HTML fragments for the footer
+ """
+ html = []
+
+ # Add JavaScript for interactivity only if cell expansion is enabled
+ # and we're not using the shared styles approach
+ if self.enable_cell_expansion and not self.use_shared_styles:
+ html.append(self._get_javascript())
+
+ # Add truncation message if needed
+ # NOTE(review): the literal below is split across lines with its markup
+ # missing in this copy of the patch - confirm against upstream.
+ if has_more and self.show_truncation_message:
+ html.append("
Data truncated due to size.
")
+
+ return html
+
+ def _get_default_css(self) -> str:
+ """Get default CSS styles for the HTML table.
+
+ Returns:
+ CSS rules for the ``expandable-container``, ``expandable``,
+ ``full-text`` and ``expand-btn`` classes
+ """
+ return """
+ .expandable-container {
+ display: inline-block;
+ max-width: 200px;
+ }
+ .expandable {
+ white-space: nowrap;
+ overflow: hidden;
+ text-overflow: ellipsis;
+ display: block;
+ }
+ .full-text {
+ display: none;
+ white-space: normal;
+ }
+ .expand-btn {
+ cursor: pointer;
+ color: blue;
+ text-decoration: underline;
+ border: none;
+ background: none;
+ font-size: inherit;
+ display: block;
+ margin-top: 5px;
+ }
+ """
+
+ def _get_javascript(self) -> str:
+ """Get JavaScript code for interactive elements.
+
+ Returns:
+ The script block as a string
+ """
+ # NOTE(review): the returned literal contains only whitespace in this copy
+ # of the patch; the script markup appears to have been stripped - confirm
+ # against upstream.
+ return """
+
+ """
+
+
+class FormatterManager:
+ """Manager class for the global DataFrame HTML formatter instance.
+
+ The formatter is stored as a class attribute, so there is a single shared
+ instance per process and the class is never instantiated itself.
+ """
+
+ # Default-configured formatter, created once when this class body executes
+ _default_formatter: DataFrameHtmlFormatter = DataFrameHtmlFormatter()
+
+ @classmethod
+ def set_formatter(cls, formatter: DataFrameHtmlFormatter) -> None:
+ """Set the global DataFrame HTML formatter.
+
+ Args:
+ formatter: The formatter instance to use globally
+ """
+ cls._default_formatter = formatter
+ _refresh_formatter_reference()
+
+ @classmethod
+ def get_formatter(cls) -> DataFrameHtmlFormatter:
+ """Get the current global DataFrame HTML formatter.
+
+ Returns:
+ The global HTML formatter instance
+ """
+ return cls._default_formatter
+
+
+def get_formatter() -> DataFrameHtmlFormatter:
+ """Get the current global DataFrame HTML formatter.
+
+ This function is used by the DataFrame._repr_html_ implementation to access
+ the shared formatter instance. It can also be used directly when custom
+ HTML rendering is needed.
+
+ The returned object is the shared instance itself, not a copy, so changing
+ its attributes affects all subsequent rendering.
+
+ Returns:
+ The global HTML formatter instance
+
+ Example:
+ >>> from datafusion.html_formatter import get_formatter
+ >>> formatter = get_formatter()
+ >>> formatter.max_cell_length = 50 # Increase cell length
+ """
+ return FormatterManager.get_formatter()
+
+
+def set_formatter(formatter: DataFrameHtmlFormatter) -> None:
+ """Set the global DataFrame HTML formatter.
+
+ Args:
+ formatter: The formatter instance to use globally
+
+ Example:
+ >>> from datafusion.html_formatter import (
+ ... DataFrameHtmlFormatter,
+ ... set_formatter,
+ ... )
+ >>> custom_formatter = DataFrameHtmlFormatter(max_cell_length=100)
+ >>> set_formatter(custom_formatter)
+ """
+ FormatterManager.set_formatter(formatter)
+
+
+def configure_formatter(**kwargs: Any) -> None:
+ """Configure the global DataFrame HTML formatter.
+
+ This function creates a new formatter with the provided configuration
+ and sets it as the global formatter for all DataFrames.
+
+ Because a brand-new instance replaces the previous one, type formatters and
+ custom cell/header builders registered on the old formatter are not carried
+ over. Options that are not passed fall back to the constructor defaults
+ rather than to the previous formatter's values.
+
+ Args:
+ **kwargs: Formatter configuration parameters like max_cell_length,
+ max_width, max_height, enable_cell_expansion, etc.
+
+ Raises:
+ ValueError: If a size parameter is not a positive integer
+ TypeError: If an argument has the wrong type or is not a parameter of
+ DataFrameHtmlFormatter
+
+ Example:
+ >>> from datafusion.html_formatter import configure_formatter
+ >>> configure_formatter(
+ ... max_cell_length=50,
+ ... max_height=500,
+ ... enable_cell_expansion=True,
+ ... use_shared_styles=True
+ ... )
+ """
+ set_formatter(DataFrameHtmlFormatter(**kwargs))
+
+
+def reset_formatter() -> None:
+ """Reset the global DataFrame HTML formatter to default settings.
+
+ This function creates a new formatter with default configuration
+ and sets it as the global formatter for all DataFrames. It also clears the
+ shared styles-loaded flag, so the next render includes the styles again.
+
+ Example:
+ >>> from datafusion.html_formatter import reset_formatter
+ >>> reset_formatter() # Reset formatter to default settings
+ """
+ formatter = DataFrameHtmlFormatter()
+ # Reset the styles_loaded flag to ensure styles will be reloaded
+ DataFrameHtmlFormatter._styles_loaded = False
+ set_formatter(formatter)
+
+
+def reset_styles_loaded_state() -> None:
+ """Reset the styles loaded state to force reloading of styles.
+
+ This can be useful when switching between notebook sessions or
+ when styles need to be refreshed. Only the class-level flag is cleared; the
+ current global formatter and its configuration are left untouched.
+
+ Example:
+ >>> from datafusion.html_formatter import reset_styles_loaded_state
+ >>> reset_styles_loaded_state() # Force styles to reload in next render
+ """
+ DataFrameHtmlFormatter._styles_loaded = False
+
+
+def _refresh_formatter_reference() -> None:
+ """Placeholder hook invoked whenever the global formatter is replaced.
+
+ The body contains no statements, so calling it currently has no effect. It
+ is intended as the place to notify modules that might be caching the
+ formatter reference.
+ """
+ # This is a no-op but signals modules to refresh their reference
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py
index eda13930d..464b884db 100644
--- a/python/tests/test_dataframe.py
+++ b/python/tests/test_dataframe.py
@@ -28,8 +28,17 @@
column,
literal,
)
-from datafusion import functions as f
+from datafusion import (
+ functions as f,
+)
from datafusion.expr import Window
+from datafusion.html_formatter import (
+ DataFrameHtmlFormatter,
+ configure_formatter,
+ get_formatter,
+ reset_formatter,
+ reset_styles_loaded_state,
+)
from pyarrow.csv import write_csv
@@ -102,6 +111,12 @@ def partitioned_df():
return ctx.create_dataframe([[batch]])
+@pytest.fixture
+def clean_formatter_state():
+ """Reset the global HTML formatter before the requesting test runs.
+
+ The fixture has no yield, so there is no teardown step: state changed by a
+ test is only cleared when the next test that requests this fixture starts.
+ """
+ reset_formatter()
+
+
def test_select(df):
df_1 = df.select(
column("a") + column("b"),
@@ -656,6 +671,252 @@ def test_window_frame_defaults_match_postgres(partitioned_df):
assert df_2.sort(col_a).to_pydict() == expected
+def test_html_formatter_configuration(df, clean_formatter_state):
+ """Test configuring the HTML formatter with different options.
+
+ Checks that the configured width and height appear in the rendered output
+ and that no expandable cells are produced when expansion is disabled.
+ """
+ # Configure with custom settings
+ configure_formatter(
+ max_cell_length=5,
+ max_width=500,
+ max_height=200,
+ enable_cell_expansion=False,
+ )
+
+ html_output = df._repr_html_()
+
+ # Verify our configuration was applied
+ assert "max-height: 200px" in html_output
+ assert "max-width: 500px" in html_output
+ # With cell expansion disabled, we shouldn't see expandable-container elements
+ assert "expandable-container" not in html_output
+
+
+def test_html_formatter_custom_style_provider(df, clean_formatter_state):
+ """Test using custom style providers with the HTML formatter.
+
+ The provider below does not inherit from StyleProvider; it is accepted
+ because it defines both required methods.
+ """
+
+ class CustomStyleProvider:
+ def get_cell_style(self) -> str:
+ return (
+ "background-color: #f5f5f5; color: #333; padding: 8px; border: "
+ "1px solid #ddd;"
+ )
+
+ def get_header_style(self) -> str:
+ return (
+ "background-color: #4285f4; color: white; font-weight: bold; "
+ "padding: 10px; border: 1px solid #3367d6;"
+ )
+
+ # Configure with custom style provider
+ configure_formatter(style_provider=CustomStyleProvider())
+
+ html_output = df._repr_html_()
+
+ # Verify our custom styles were applied
+ # (header background and text colour, then cell background)
+ assert "background-color: #4285f4" in html_output
+ assert "color: white" in html_output
+ assert "background-color: #f5f5f5" in html_output
+
+
+def test_html_formatter_type_formatters(df, clean_formatter_state):
+ """Test registering custom type formatters for specific data types."""
+
+ # Get current formatter and register custom formatters
+ formatter = get_formatter()
+
+ # Format integers with color based on value
+ # Using int as the type for the formatter will work since we convert
+ # Arrow scalar values to Python native types in _get_cell_value
+ # NOTE(review): the f-string below and the asserted literal at the end of
+ # this test appear to have lost their HTML markup in this copy of the patch
+ # - confirm against upstream.
+ def format_int(value):
+ return f' 2 else "blue"}">{value}'
+
+ formatter.register_formatter(int, format_int)
+
+ html_output = df._repr_html_()
+
+ # Our test dataframe has values 1,2,3 so we should see:
+ assert '1' in html_output
+
+
+def test_html_formatter_custom_cell_builder(df, clean_formatter_state):
+ """Test using a custom cell builder function."""
+
+ # Create a custom cell builder with distinct styling for different value ranges
+ def custom_cell_builder(value, row, col, table_id):
+ try:
+ num_value = int(value)
+ if num_value > 5: # Values > 5 get green background with indicator
+ return (
+ '
{value}-high
'
+ )
+ if num_value < 3: # Values < 3 get blue background with indicator
+ return (
+ '
{value}-low
'
+ )
+ except (ValueError, TypeError):
+ pass
+
+ # Default styling for other cells (3, 4, 5)
+ return f'
{value}-mid
'
+
+ # Set our custom cell builder
+ formatter = get_formatter()
+ formatter.set_custom_cell_builder(custom_cell_builder)
+
+ html_output = df._repr_html_()
+
+ # Extract cells with specific styling using regex
+ low_cells = re.findall(
+ r'
'
+ )
+
+ # Set our custom header builder
+ formatter = get_formatter()
+ formatter.set_custom_header_builder(custom_header_builder)
+
+ html_output = df._repr_html_()
+
+ # Verify our custom headers were applied
+ assert 'title="Primary key column"' in html_output
+ assert 'title="Secondary values"' in html_output
+ assert "background-color: #333; color: white" in html_output
+
+
+def test_html_formatter_complex_customization(df, clean_formatter_state):
+ """Test combining multiple customization options together.
+
+ Combines a custom style provider, custom CSS and a type formatter, then
+ checks that each one shows up in the rendered output.
+ """
+
+ # Create a dark mode style provider
+ class DarkModeStyleProvider:
+ def get_cell_style(self) -> str:
+ return (
+ "background-color: #222; color: #eee; "
+ "padding: 8px; border: 1px solid #444;"
+ )
+
+ def get_header_style(self) -> str:
+ return (
+ "background-color: #111; color: #fff; padding: 10px; "
+ "border: 1px solid #333;"
+ )
+
+ # Configure with dark mode style
+ configure_formatter(
+ max_cell_length=10,
+ style_provider=DarkModeStyleProvider(),
+ custom_css="""
+ .datafusion-table {
+ font-family: monospace;
+ border-collapse: collapse;
+ }
+ .datafusion-table tr:hover td {
+ background-color: #444 !important;
+ }
+ """,
+ )
+
+ # Add type formatters for special formatting - now working with native int values
+ # The formatter is fetched after configure_formatter because that call
+ # installs a new global instance; registering on the earlier one would be lost.
+ # NOTE(review): the lambda's f-string appears to have lost its HTML markup
+ # in this copy of the patch - confirm against upstream.
+ formatter = get_formatter()
+ formatter.register_formatter(
+ int,
+ lambda n: f'{n}',
+ )
+
+ html_output = df._repr_html_()
+
+ # Verify our customizations were applied
+ assert "background-color: #222" in html_output
+ assert "background-color: #111" in html_output
+ assert ".datafusion-table" in html_output
+ assert "color: #5af" in html_output # Even numbers
+
+
def test_get_dataframe(tmp_path):
ctx = SessionContext()
@@ -1244,7 +1505,10 @@ def add_with_parameter(df_internal, value: Any) -> DataFrame:
assert result["new_col"] == [3 for _i in range(3)]
-def test_dataframe_repr_html(df) -> None:
+def test_dataframe_repr_html_structure(df) -> None:
+ """Test that DataFrame._repr_html_ produces expected HTML output structure."""
+ import re
+
output = df._repr_html_()
# Since we've added a fair bit of processing to the html output, lets just verify
@@ -1255,9 +1519,131 @@ def test_dataframe_repr_html(df) -> None:
headers = ["a", "b", "c"]
headers = [f"
{v}
" for v in headers]
header_pattern = "(.*?)".join(headers)
- assert len(re.findall(header_pattern, output, re.DOTALL)) == 1
+ header_matches = re.findall(header_pattern, output, re.DOTALL)
+ assert len(header_matches) == 1
+ # Update the pattern to handle values that may be wrapped in spans
body_data = [[1, 4, 8], [2, 5, 5], [3, 6, 8]]
- body_lines = [f"
{v}
" for inner in body_data for v in inner]
+
+ body_lines = [
+ f"
(?:]*?>)?{v}(?:)?
"
+ for inner in body_data
+ for v in inner
+ ]
body_pattern = "(.*?)".join(body_lines)
- assert len(re.findall(body_pattern, output, re.DOTALL)) == 1
+
+ body_matches = re.findall(body_pattern, output, re.DOTALL)
+
+ assert len(body_matches) == 1, "Expected pattern of values not found in HTML output"
+
+
+def test_dataframe_repr_html_values(df):
+ """Test that DataFrame._repr_html_ contains the expected data values.
+
+ The values are matched in row-major order with a single regular expression,
+ so the test checks ordering as well as presence.
+ """
+ html = df._repr_html_()
+ assert html is not None
+
+ # Create a more flexible pattern that handles values being wrapped in spans
+ # This pattern will match the sequence of values 1,4,8,2,5,5,3,6,8 regardless
+ # of formatting
+ # NOTE(review): the raw-string fragments below are split across lines with
+ # their tag names missing in this copy of the patch - confirm against
+ # upstream.
+ pattern = re.compile(
+ r"
]*?>(?:]*?>)?1(?:)?
.*?"
+ r"
]*?>(?:]*?>)?4(?:)?
.*?"
+ r"
]*?>(?:]*?>)?8(?:)?
.*?"
+ r"
]*?>(?:]*?>)?2(?:)?
.*?"
+ r"
]*?>(?:]*?>)?5(?:)?
.*?"
+ r"
]*?>(?:]*?>)?5(?:)?
.*?"
+ r"
]*?>(?:]*?>)?3(?:)?
.*?"
+ r"
]*?>(?:]*?>)?6(?:)?
.*?"
+ r"
]*?>(?:]*?>)?8(?:)?
",
+ re.DOTALL,
+ )
+
+ # Print debug info if the test fails
+ matches = re.findall(pattern, html)
+ if not matches:
+ print(f"HTML output snippet: {html[:500]}...") # noqa: T201
+
+ assert len(matches) > 0, "Expected pattern of values not found in HTML output"
+
+
+def test_html_formatter_shared_styles(df, clean_formatter_state):
+ """Test that shared styles work correctly across multiple tables."""
+
+ # First, ensure we're using shared styles
+ configure_formatter(use_shared_styles=True)
+
+ # Get HTML output for first table - should include styles
+ html_first = df._repr_html_()
+
+ # Verify styles are included in first render
+ assert "
+ // Convert record batches to PyObject list
+ let py_batches = batches
+ .into_iter()
+ .map(|rb| rb.to_pyarrow(py))
+ .collect::>>()?;
-
-
- \n".to_string();
+ let py_schema = self.schema().into_pyobject(py)?;
- let schema = batches[0].schema();
+ // Get the Python formatter module and call format_html
+ let formatter_module = py.import("datafusion.html_formatter")?;
+ let get_formatter = formatter_module.getattr("get_formatter")?;
+ let formatter = get_formatter.call0()?;
- let mut header = Vec::new();
- for field in schema.fields() {
- header.push(format!("
\n", header_str));
-
- let batch_formatters = batches
- .iter()
- .map(|batch| {
- batch
- .columns()
- .iter()
- .map(|c| ArrayFormatter::try_new(c.as_ref(), &FormatOptions::default()))
- .map(|c| {
- c.map_err(|e| PyValueError::new_err(format!("Error: {:?}", e.to_string())))
- })
- .collect::, _>>()
- })
- .collect::, _>>()?;
-
- let rows_per_batch = batches.iter().map(|batch| batch.num_rows());
-
- // We need to build up row by row for html
- let mut table_row = 0;
- for (batch_formatter, num_rows_in_batch) in batch_formatters.iter().zip(rows_per_batch) {
- for batch_row in 0..num_rows_in_batch {
- table_row += 1;
- let mut cells = Vec::new();
- for (col, formatter) in batch_formatter.iter().enumerate() {
- let cell_data = formatter.value(batch_row).to_string();
- // From testing, primitive data types do not typically get larger than 21 characters
- if cell_data.len() > MAX_LENGTH_CELL_WITHOUT_MINIMIZE {
- let short_cell_data = &cell_data[0..MAX_LENGTH_CELL_WITHOUT_MINIMIZE];
- cells.push(format!("
-