|
15 | 15 | .. specific language governing permissions and limitations |
16 | 16 | .. under the License. |
17 | 17 |
|
18 | | -DataFrame Operations |
19 | | -=================== |
| 18 | +DataFrames |
| 19 | +========== |
20 | 20 |
|
21 | | -Working with DataFrames |
22 | | ----------------------- |
| 21 | +Overview |
| 22 | +-------- |
23 | 23 |
|
24 | | -A DataFrame in DataFusion represents a logical plan that defines a series of operations to be performed on data. |
25 | | -This logical plan is not executed until you call a terminal operation like :py:func:`~datafusion.dataframe.DataFrame.collect` |
26 | | -or :py:func:`~datafusion.dataframe.DataFrame.show`. |
| 24 | +DataFusion's DataFrame API provides a powerful interface for building and executing queries against data sources. |
| 25 | +It offers a familiar API similar to pandas and other DataFrame libraries, but with the performance benefits of Rust |
| 26 | +and Arrow. |
27 | 27 |
|
28 | | -DataFrames provide a familiar API for data manipulation: |
| 28 | +A DataFrame represents a logical plan that can be composed through operations like filtering, projection, and aggregation. |
| 29 | +Execution is deferred until a terminal operation such as :py:func:`~datafusion.dataframe.DataFrame.collect` or :py:func:`~datafusion.dataframe.DataFrame.show` is called. |
29 | 30 |
|
30 | | -.. ipython:: python |
| 31 | +Basic Usage |
| 32 | +----------- |
31 | 33 |
|
32 | | - import datafusion |
33 | | - from datafusion import col, lit, functions as f |
34 | | - |
35 | | - ctx = datafusion.SessionContext() |
36 | | - |
37 | | - # Create a DataFrame from a CSV file |
38 | | - df = ctx.read_csv("example.csv") |
39 | | - |
40 | | - # Add transformations |
41 | | - df = df.filter(col("age") > lit(30)) \ |
42 | | - .select([col("name"), col("age"), (col("salary") * lit(1.1)).alias("new_salary")]) \ |
43 | | - .sort("age") |
44 | | - |
45 | | - # Execute the plan |
46 | | - df.show() |
47 | | -
|
48 | | -Common DataFrame Operations |
49 | | --------------------------- |
50 | | - |
51 | | -DataFusion supports a wide range of operations on DataFrames: |
52 | | - |
53 | | -Filtering and Selection |
54 | | -~~~~~~~~~~~~~~~~~~~~~~~ |
| 34 | +.. code-block:: python |
55 | 35 |
|
56 | | -.. ipython:: python |
| 36 | + import datafusion |
| 37 | + from datafusion import col, lit |
57 | 38 |
|
58 | | - # Filter rows |
59 | | - df = df.filter(col("age") > lit(30)) |
| 39 | + # Create a context and register a data source |
| 40 | + ctx = datafusion.SessionContext() |
| 41 | + ctx.register_csv("my_table", "path/to/data.csv") |
60 | 42 | |
61 | | - # Select columns |
62 | | - df = df.select([col("name"), col("age")]) |
| 43 | + # Create and manipulate a DataFrame |
| 44 | + df = ctx.sql("SELECT * FROM my_table") |
63 | 45 | |
64 | | - # Select by column name |
65 | | - df = df.select_columns(["name", "age"]) |
| 46 | + # Or use the DataFrame API directly |
| 47 | + df = (ctx.table("my_table") |
| 48 | + .filter(col("age") > lit(25)) |
| 49 | + .select([col("name"), col("age")])) |
66 | 50 | |
67 | | - # Select using column indexing |
68 | | - df = df["name", "age"] |
69 | | -
|
70 | | -Aggregation |
71 | | -~~~~~~~~~~ |
72 | | - |
73 | | -.. ipython:: python |
74 | | -
|
75 | | - # Group by and aggregate |
76 | | - df = df.aggregate( |
77 | | - [col("category")], # Group by columns |
78 | | - [f.sum(col("amount")).alias("total"), |
79 | | - f.avg(col("price")).alias("avg_price")] |
80 | | - ) |
81 | | -
|
82 | | -Joins |
83 | | -~~~~~ |
84 | | - |
85 | | -.. ipython:: python |
86 | | -
|
87 | | - # Join two DataFrames |
88 | | - df_joined = df1.join( |
89 | | - df2, |
90 | | - how="inner", |
91 | | - left_on=["id"], |
92 | | - right_on=["id"] |
93 | | - ) |
| 51 | + # Execute and collect results |
| 52 | + result = df.collect() |
94 | 53 | |
95 | | - # Join with custom expressions |
96 | | - df_joined = df1.join_on( |
97 | | - df2, |
98 | | - [col("df1.id") == col("df2.id")], |
99 | | - how="left" |
100 | | - ) |
101 | | -
|
102 | | -DataFrame Visualization |
103 | | ----------------------- |
104 | | - |
105 | | -Jupyter Notebook Integration |
106 | | -~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
| 54 | + # Display the first few rows |
| 55 | + df.show() |
107 | 56 |
|
108 | | -When working in Jupyter notebooks, DataFrames automatically display as HTML tables. This is |
109 | | -handled by the :code:`_repr_html_` method, which provides a rich, formatted view of your data. |
| 57 | +HTML Rendering |
| 58 | +-------------- |
110 | 59 |
|
111 | | -.. ipython:: python |
| 60 | +When working in Jupyter notebooks or other environments that support HTML rendering, DataFrames will |
| 61 | +automatically display as formatted HTML tables, making it easier to visualize your data. |
112 | 62 |
|
113 | | - # DataFrames render as HTML tables in notebooks |
114 | | - df # Just displaying the DataFrame renders it as HTML |
| 63 | +The ``_repr_html_`` method is called automatically by Jupyter to render a DataFrame. This method |
| 64 | +controls how DataFrames appear in notebook environments, providing a richer visualization than |
| 65 | +plain text output. |
115 | 66 |
|
116 | | -Customizing DataFrame Display |
117 | | -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
| 67 | +Customizing HTML Rendering |
| 68 | +-------------------------- |
118 | 69 |
|
119 | | -You can customize how DataFrames are displayed using the HTML formatter: |
| 70 | +You can customize how DataFrames are rendered in HTML by configuring the formatter: |
120 | 71 |
|
121 | | -.. ipython:: python |
| 72 | +.. code-block:: python |
122 | 73 |
|
123 | 74 | from datafusion.html_formatter import configure_formatter |
124 | 75 | |
125 | | - # Change display settings |
| 76 | + # Change the default styling |
126 | 77 | configure_formatter( |
127 | | - max_rows=100, # Show more rows |
128 | | - truncate_width=30, # Allow longer strings |
129 | | - theme="light", # Use light theme |
130 | | - precision=2 # Set decimal precision |
| 78 | + max_rows=50, # Maximum number of rows to display |
| 79 | + max_width=None, # Maximum width in pixels (None for auto) |
| 80 | + theme="light", # Theme: "light" or "dark" |
| 81 | + precision=2, # Floating point precision |
| 82 | + thousands_separator=",", # Separator for thousands |
| 83 | + date_format="%Y-%m-%d", # Date format |
| 84 | + truncate_width=20 # Max width for string columns before truncating |
131 | 85 | ) |
132 | | - |
133 | | - # Now display uses the new format |
134 | | - df.show() |
135 | 86 |
|
136 | | -Creating a Custom Style Provider |
137 | | -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
| 87 | +The formatter settings affect all DataFrames displayed after configuration. |
| 88 | + |
| 89 | +Custom Style Providers |
| 90 | +---------------------- |
138 | 91 |
|
139 | | -For advanced styling needs: |
| 92 | +For advanced styling needs, you can create a custom style provider: |
140 | 93 |
|
141 | 94 | .. code-block:: python |
142 | 95 |
|
143 | 96 | from datafusion.html_formatter import StyleProvider, configure_formatter |
144 | 97 | |
145 | | - class CustomStyleProvider(StyleProvider): |
| 98 | + class MyStyleProvider(StyleProvider): |
146 | 99 | def get_table_styles(self): |
147 | 100 | return { |
148 | 101 | "table": "border-collapse: collapse; width: 100%;", |
149 | | - "th": "background-color: #4CAF50; color: white; padding: 10px;", |
| 102 | + "th": "background-color: #007bff; color: white; padding: 8px; text-align: left;", |
150 | 103 | "td": "border: 1px solid #ddd; padding: 8px;", |
151 | | - "tr:hover": "background-color: #f5f5f5;", |
| 104 | + "tr:nth-child(even)": "background-color: #f2f2f2;", |
152 | 105 | } |
153 | 106 | |
154 | 107 | def get_value_styles(self, dtype, value): |
| 108 | + """Return custom styles for specific values""" |
155 | 109 | if dtype == "float" and value < 0: |
156 | | - return "color: red; font-weight: bold;" |
| 110 | + return "color: red;" |
157 | 111 | return None |
158 | 112 | |
159 | | - # Apply custom styling |
160 | | - configure_formatter(style_provider=CustomStyleProvider()) |
| 113 | + # Apply the custom style provider |
| 114 | + configure_formatter(style_provider=MyStyleProvider()) |
161 | 115 |
|
162 | | -Managing Display Settings |
163 | | -~~~~~~~~~~~~~~~~~~~~~~~ |
| 116 | +Creating a Custom Formatter |
| 117 | +--------------------------- |
164 | 118 |
|
165 | | -You can temporarily change formatting settings with context managers: |
| 119 | +For complete control over rendering, you can implement a custom formatter: |
166 | 120 |
|
167 | 121 | .. code-block:: python |
168 | 122 |
|
169 | | - from datafusion.html_formatter import formatting_context |
| 123 | + from datafusion.html_formatter import Formatter, get_formatter |
170 | 124 | |
171 | | - # Use different formatting temporarily |
172 | | - with formatting_context(max_rows=5, theme="dark"): |
173 | | - df.show() # Will show only 5 rows with dark theme |
| 125 | + class MyFormatter(Formatter): |
| 126 | + def format_html(self, batches, schema, has_more=False, table_uuid=None): |
| 127 | + # Create your custom HTML here |
| 128 | + html = "<div class='my-custom-table'>" |
| 129 | + # ... formatting logic ... |
| 130 | + html += "</div>" |
| 131 | + return html |
174 | 132 | |
175 | | - # Reset to default formatting |
176 | | - from datafusion.html_formatter import reset_formatter |
177 | | - reset_formatter() |
| 133 | + # Set as the global formatter |
| 134 | + configure_formatter(formatter_class=MyFormatter) |
| 135 | + |
| 136 | + # Or use the formatter just for specific operations |
| 137 | + formatter = get_formatter() |
| 138 | + custom_html = formatter.format_html(batches, schema) |
178 | 139 |
|
179 | | -Converting to Other Formats |
180 | | --------------------------- |
| 140 | +Managing Formatters |
| 141 | +------------------- |
181 | 142 |
|
182 | | -DataFusion DataFrames can be easily converted to other popular formats: |
| 143 | +Reset to default formatting: |
183 | 144 |
|
184 | | -.. ipython:: python |
| 145 | +.. code-block:: python |
185 | 146 |
|
186 | | - # Convert to Arrow Table |
187 | | - arrow_table = df.to_arrow_table() |
188 | | - |
189 | | - # Convert to Pandas DataFrame |
190 | | - pandas_df = df.to_pandas() |
| 147 | + from datafusion.html_formatter import reset_formatter |
191 | 148 | |
192 | | - # Convert to Polars DataFrame |
193 | | - polars_df = df.to_polars() |
| 149 | + # Reset to default settings |
| 150 | + reset_formatter() |
| 151 | +
|
| 152 | +Get the current formatter settings: |
| 153 | + |
| 154 | +.. code-block:: python |
| 155 | +
|
| 156 | + from datafusion.html_formatter import get_formatter |
194 | 157 | |
195 | | - # Convert to Python data structures |
196 | | - python_dict = df.to_pydict() |
197 | | - python_list = df.to_pylist() |
| 158 | + formatter = get_formatter() |
| 159 | + print(formatter.max_rows) |
| 160 | + print(formatter.theme) |
198 | 161 |
|
199 | | -Saving DataFrames |
200 | | ---------------- |
| 162 | +Contextual Formatting |
| 163 | +--------------------- |
201 | 164 |
|
202 | | -You can write DataFrames to various file formats: |
| 165 | +You can also use a context manager to temporarily change formatting settings: |
203 | 166 |
|
204 | | -.. ipython:: python |
| 167 | +.. code-block:: python |
205 | 168 |
|
206 | | - # Write to CSV |
207 | | - df.write_csv("output.csv", with_header=True) |
| 169 | + from datafusion.html_formatter import formatting_context |
208 | 170 | |
209 | | - # Write to Parquet |
210 | | - df.write_parquet("output.parquet", compression="zstd") |
| 171 | + # Default formatting |
| 172 | + df.show() |
| 173 | + |
| 174 | + # Temporarily use different formatting |
| 175 | + with formatting_context(max_rows=100, theme="dark"): |
| 176 | + df.show() # Will use the temporary settings |
211 | 177 | |
212 | | - # Write to JSON |
213 | | - df.write_json("output.json") |
| 178 | + # Back to default formatting |
| 179 | + df.show() |
0 commit comments