
Commit 102c121

Merge branch 'apache:main' into test_SpillPool
2 parents (1b8ef43 + 9de192a) · commit 102c121

43 files changed (+1884 −482 lines)


.github/workflows/rust.yml

Lines changed: 16 additions & 1 deletion
@@ -709,6 +709,11 @@ jobs:
           ./dev/update_function_docs.sh
           git diff --exit-code

+  # This job ensures `datafusion-examples/README.md` stays in sync with the source code:
+  # 1. Generates README automatically using the Rust examples docs generator
+  #    (parsing documentation from `examples/<group>/main.rs`)
+  # 2. Formats the generated Markdown using DataFusion's standard Prettier setup
+  # 3. Compares the result against the committed README.md and fails if out-of-date
   examples-docs-check:
     name: check example README is up-to-date
     needs: linux-build-lib
@@ -721,6 +726,16 @@ jobs:
         with:
           submodules: true
           fetch-depth: 1
+
+      - name: Mark repository as safe for git
+        # Required for git commands inside container (avoids "dubious ownership" error)
+        run: git config --global --add safe.directory "$GITHUB_WORKSPACE"
+
+      - name: Set up Node.js (required for prettier)
+        # doc_prettier_check.sh uses npx to run prettier for Markdown formatting
+        uses: actions/setup-node@v4
+        with:
+          node-version: '18'

       - name: Run examples docs check script
         run: |
@@ -778,4 +793,4 @@ jobs:
         run: cargo msrv --output-format json --log-target stdout verify
       - name: Check datafusion-proto
         working-directory: datafusion/proto
-        run: cargo msrv --output-format json --log-target stdout verify
+        run: cargo msrv --output-format json --log-target stdout verify

ci/scripts/check_examples_docs.sh

Lines changed: 47 additions & 38 deletions
@@ -17,48 +17,57 @@
 # specific language governing permissions and limitations
 # under the License.

-set -euo pipefail
-
-EXAMPLES_DIR="datafusion-examples/examples"
-README="datafusion-examples/README.md"
-
-# ffi examples are skipped because they were not part of the recent example
-# consolidation work and do not follow the new grouping and execution pattern.
-# They are not documented in the README using the new structure, so including
-# them here would cause false CI failures.
-SKIP_LIST=("ffi")
-
-missing=0
+# Generates documentation for DataFusion examples using the Rust-based
+# documentation generator and verifies that the committed README.md
+# is up to date.
+#
+# The README is generated from documentation comments in:
+#   datafusion-examples/examples/<group>/main.rs
+#
+# This script is intended to be run in CI to ensure that example
+# documentation stays in sync with the code.
+#
+# To update the README locally, run this script and replace README.md
+# with the generated output.

-skip() {
-  local value="$1"
-  for item in "${SKIP_LIST[@]}"; do
-    if [[ "$item" == "$value" ]]; then
-      return 0
-    fi
-  done
-  return 1
-}
+set -euo pipefail

-# collect folder names
-folders=$(find "$EXAMPLES_DIR" -mindepth 1 -maxdepth 1 -type d -exec basename {} \;)
+ROOT_DIR="$(git rev-parse --show-toplevel)"
+EXAMPLES_DIR="$ROOT_DIR/datafusion-examples"
+README="$EXAMPLES_DIR/README.md"
+README_NEW="$EXAMPLES_DIR/README-NEW.md"

-# collect group names from README headers
-groups=$(grep "^### Group:" "$README" | sed -E 's/^### Group: `([^`]+)`.*/\1/')
+echo "▶ Generating examples README (Rust generator)…"
+cargo run --quiet \
+  --manifest-path "$EXAMPLES_DIR/Cargo.toml" \
+  --bin examples-docs \
+  > "$README_NEW"

-for folder in $folders; do
-  if skip "$folder"; then
-    echo "Skipped group: $folder"
-    continue
-  fi
+echo "▶ Formatting generated README with Prettier…"
+npx [email protected] \
+  --parser markdown \
+  --write "$README_NEW"

-  if ! echo "$groups" | grep -qx "$folder"; then
-    echo "Missing README entry for example group: $folder"
-    missing=1
-  fi
-done
+echo "▶ Comparing generated README with committed version…"

-if [[ $missing -eq 1 ]]; then
-  echo "README is out of sync with examples"
-  exit 1
+if ! diff -u "$README" "$README_NEW" > /tmp/examples-readme.diff; then
+  echo ""
+  echo "❌ Examples README is out of date."
+  echo ""
+  echo "The examples documentation is generated automatically from:"
+  echo " - datafusion-examples/examples/<group>/main.rs"
+  echo ""
+  echo "To update the README locally, run:"
+  echo ""
+  echo " cargo run --bin examples-docs \\"
+  echo "   | npx [email protected] --parser markdown --write \\"
+  echo "   > datafusion-examples/README.md"
+  echo ""
+  echo "Diff:"
+  echo "------------------------------------------------------------"
+  cat /tmp/examples-readme.diff
+  echo "------------------------------------------------------------"
+  exit 1
 fi
+
+echo "✅ Examples README is up-to-date."

datafusion-examples/Cargo.toml

Lines changed: 1 addition & 1 deletion
@@ -41,7 +41,7 @@ arrow-schema = { workspace = true }
 datafusion = { workspace = true, default-features = true, features = ["parquet_encryption"] }
 datafusion-common = { workspace = true }
 tempfile = { workspace = true }
-tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot"] }
+tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot", "fs"] }

 [dev-dependencies]
 arrow-flight = { workspace = true }
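The only change to this manifest is enabling tokio's `fs` feature, which gates the async filesystem APIs under `tokio::fs`; presumably one of the examples touched by this commit now does async file I/O. A standalone illustration of what the feature unlocks (not code from the commit; it also assumes tokio's `rt` and `macros` features for `#[tokio::main]`):

    // Illustrative only: tokio::fs is compiled in only when the "fs" feature
    // (or "full") is enabled, which is exactly what this Cargo.toml change adds.
    #[tokio::main]
    async fn main() -> std::io::Result<()> {
        // Hypothetical file name; any readable path works.
        let contents = tokio::fs::read_to_string("example.csv").await?;
        println!("read {} bytes asynchronously", contents.len());
        Ok(())
    }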

datafusion-examples/README.md

Lines changed: 15 additions & 13 deletions
@@ -71,15 +71,16 @@ cargo run --example dataframe -- dataframe

 #### Category: Single Process

-| Subcommand | File Path | Description |
-| --------------------- | ----------------------------------------------------------------------------------------------------- | --------------------------------------------- |
-| csv_sql_streaming | [`custom_data_source/csv_sql_streaming.rs`](examples/custom_data_source/csv_sql_streaming.rs) | Run a streaming SQL query against CSV data |
-| csv_json_opener | [`custom_data_source/csv_json_opener.rs`](examples/custom_data_source/csv_json_opener.rs) | Use low-level FileOpener APIs for CSV/JSON |
-| custom_datasource | [`custom_data_source/custom_datasource.rs`](examples/custom_data_source/custom_datasource.rs) | Query a custom TableProvider |
-| custom_file_casts | [`custom_data_source/custom_file_casts.rs`](examples/custom_data_source/custom_file_casts.rs) | Implement custom casting rules |
-| custom_file_format | [`custom_data_source/custom_file_format.rs`](examples/custom_data_source/custom_file_format.rs) | Write to a custom file format |
-| default_column_values | [`custom_data_source/default_column_values.rs`](examples/custom_data_source/default_column_values.rs) | Custom default values using metadata |
-| file_stream_provider | [`custom_data_source/file_stream_provider.rs`](examples/custom_data_source/file_stream_provider.rs) | Read/write via FileStreamProvider for streams |
+| Subcommand | File Path | Description |
+| --------------------- | ----------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------- |
+| adapter_serialization | [`custom_data_source/adapter_serialization.rs`](examples/custom_data_source/adapter_serialization.rs) | Preserve custom PhysicalExprAdapter information during plan serialization using PhysicalExtensionCodec interception |
+| csv_json_opener | [`custom_data_source/csv_json_opener.rs`](examples/custom_data_source/csv_json_opener.rs) | Use low-level FileOpener APIs for CSV/JSON |
+| csv_sql_streaming | [`custom_data_source/csv_sql_streaming.rs`](examples/custom_data_source/csv_sql_streaming.rs) | Run a streaming SQL query against CSV data |
+| custom_datasource | [`custom_data_source/custom_datasource.rs`](examples/custom_data_source/custom_datasource.rs) | Query a custom TableProvider |
+| custom_file_casts | [`custom_data_source/custom_file_casts.rs`](examples/custom_data_source/custom_file_casts.rs) | Implement custom casting rules |
+| custom_file_format | [`custom_data_source/custom_file_format.rs`](examples/custom_data_source/custom_file_format.rs) | Write to a custom file format |
+| default_column_values | [`custom_data_source/default_column_values.rs`](examples/custom_data_source/default_column_values.rs) | Custom default values using metadata |
+| file_stream_provider | [`custom_data_source/file_stream_provider.rs`](examples/custom_data_source/file_stream_provider.rs) | Read/write via FileStreamProvider for streams |

 ## Data IO Examples

@@ -143,8 +144,8 @@ cargo run --example dataframe -- dataframe

 | Subcommand | File Path | Description |
 | ---------- | ------------------------------------------------------- | ------------------------------------------------------ |
-| server | [`flight/server.rs`](examples/flight/server.rs) | Run DataFusion server accepting FlightSQL/JDBC queries |
 | client | [`flight/client.rs`](examples/flight/client.rs) | Execute SQL queries via Arrow Flight protocol |
+| server | [`flight/server.rs`](examples/flight/server.rs) | Run DataFusion server accepting FlightSQL/JDBC queries |
 | sql_server | [`flight/sql_server.rs`](examples/flight/sql_server.rs) | Standalone SQL server for JDBC clients |

 ## Proto Examples
@@ -153,9 +154,10 @@ cargo run --example dataframe -- dataframe

 #### Category: Single Process

-| Subcommand | File Path | Description |
-| ------------------------ | --------------------------------------------------------------------------------- | --------------------------------------------------------------- |
-| composed_extension_codec | [`proto/composed_extension_codec.rs`](examples/proto/composed_extension_codec.rs) | Use multiple extension codecs for serialization/deserialization |
+| Subcommand | File Path | Description |
+| ------------------------ | --------------------------------------------------------------------------------- | ----------------------------------------------------------------------------- |
+| composed_extension_codec | [`proto/composed_extension_codec.rs`](examples/proto/composed_extension_codec.rs) | Use multiple extension codecs for serialization/deserialization |
+| expression_deduplication | [`proto/expression_deduplication.rs`](examples/proto/expression_deduplication.rs) | Example of expression caching/deduplication using the codec decorator pattern |

 ## Query Planning Examples

datafusion-examples/examples/builtin_functions/main.rs

Lines changed: 9 additions & 3 deletions
@@ -26,9 +26,15 @@
 //!
 //! Each subcommand runs a corresponding example:
 //! - `all` — run all examples included in this module
-//! - `date_time` — examples of date-time related functions and queries
-//! - `function_factory` — register `CREATE FUNCTION` handler to implement SQL macros
-//! - `regexp` — examples of using regular expression functions
+//!
+//! - `date_time`
+//! (file: date_time.rs, desc: Examples of date-time related functions and queries)
+//!
+//! - `function_factory`
+//! (file: function_factory.rs, desc: Register `CREATE FUNCTION` handler to implement SQL macros)
+//!
+//! - `regexp`
+//! (file: regexp.rs, desc: Examples of using regular expression functions)

 mod date_time;
 mod function_factory;
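The `//! - <subcommand>` / `(file: …, desc: …)` layout above is the structured form the README generator parses. The dispatch itself is unchanged by this diff; as a rough sketch of the pattern behind "Each subcommand runs a corresponding example", a group-level `main.rs` reads the first CLI argument and routes to the matching module. The stub modules and `run()` functions below are assumptions to keep the sketch self-contained; the real examples are async and return DataFusion `Result`s.

    // Rough sketch of a group-level subcommand dispatcher (illustrative only).
    mod date_time { pub fn run() { println!("date_time example"); } }
    mod function_factory { pub fn run() { println!("function_factory example"); } }
    mod regexp { pub fn run() { println!("regexp example"); } }

    fn main() {
        // `cargo run --example builtin_functions -- <subcommand>` passes the
        // subcommand as the first argument; default to running everything.
        let subcommand = std::env::args().nth(1).unwrap_or_else(|| "all".to_string());
        match subcommand.as_str() {
            "all" => {
                date_time::run();
                function_factory::run();
                regexp::run();
            }
            "date_time" => date_time::run(),
            "function_factory" => function_factory::run(),
            "regexp" => regexp::run(),
            other => eprintln!("unknown subcommand: {other}"),
        }
    }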

datafusion-examples/examples/custom_data_source/main.rs

Lines changed: 24 additions & 8 deletions
@@ -26,14 +26,30 @@
 //!
 //! Each subcommand runs a corresponding example:
 //! - `all` — run all examples included in this module
-//! - `adapter_serialization` — preserve custom PhysicalExprAdapter information during plan serialization using PhysicalExtensionCodec interception
-//! - `csv_json_opener` — use low level FileOpener APIs to read CSV/JSON into Arrow RecordBatches
-//! - `csv_sql_streaming` — build and run a streaming query plan from a SQL statement against a local CSV file
-//! - `custom_datasource` — run queries against a custom datasource (TableProvider)
-//! - `custom_file_casts` — implement custom casting rules to adapt file schemas
-//! - `custom_file_format` — write data to a custom file format
-//! - `default_column_values` — implement custom default value handling for missing columns using field metadata and PhysicalExprAdapter
-//! - `file_stream_provider` — run a query on FileStreamProvider which implements StreamProvider for reading and writing to arbitrary stream sources/sinks
+//!
+//! - `adapter_serialization`
+//! (file: adapter_serialization.rs, desc: Preserve custom PhysicalExprAdapter information during plan serialization using PhysicalExtensionCodec interception)
+//!
+//! - `csv_json_opener`
+//! (file: csv_json_opener.rs, desc: Use low-level FileOpener APIs for CSV/JSON)
+//!
+//! - `csv_sql_streaming`
+//! (file: csv_sql_streaming.rs, desc: Run a streaming SQL query against CSV data)
+//!
+//! - `custom_datasource`
+//! (file: custom_datasource.rs, desc: Query a custom TableProvider)
+//!
+//! - `custom_file_casts`
+//! (file: custom_file_casts.rs, desc: Implement custom casting rules)
+//!
+//! - `custom_file_format`
+//! (file: custom_file_format.rs, desc: Write to a custom file format)
+//!
+//! - `default_column_values`
+//! (file: default_column_values.rs, desc: Custom default values using metadata)
+//!
+//! - `file_stream_provider`
+//! (file: file_stream_provider.rs, desc: Read/write via FileStreamProvider for streams)

 mod adapter_serialization;
 mod csv_json_opener;
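One entry above, `default_column_values`, is summarized as "custom default values using metadata". The building block it relies on, attaching key/value metadata to an Arrow `Field`, can be sketched as follows; the metadata key is hypothetical, and the part that actually consumes it at scan time (a `PhysicalExprAdapter`) lives in the example file and is omitted here.

    // Illustrative only: store a default value as Arrow Field metadata.
    // The key "example.default_value" is made up for this sketch; the real
    // example defines its own key and reads it back in a PhysicalExprAdapter.
    use std::collections::HashMap;
    use arrow_schema::{DataType, Field, Schema};

    fn main() {
        let field = Field::new("status", DataType::Utf8, true).with_metadata(HashMap::from([(
            "example.default_value".to_string(),
            "unknown".to_string(),
        )]));
        let schema = Schema::new(vec![field]);
        println!("{:?}", schema.field(0).metadata());
    }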

datafusion-examples/examples/data_io/main.rs

Lines changed: 30 additions & 10 deletions
@@ -26,16 +26,36 @@
 //!
 //! Each subcommand runs a corresponding example:
 //! - `all` — run all examples included in this module
-//! - `catalog` — register the table into a custom catalog
-//! - `json_shredding` — shows how to implement custom filter rewriting for JSON shredding
-//! - `parquet_adv_idx` — create a detailed secondary index that covers the contents of several parquet files
-//! - `parquet_emb_idx` — store a custom index inside a Parquet file and use it to speed up queries
-//! - `parquet_enc_with_kms` — read and write encrypted Parquet files using an encryption factory
-//! - `parquet_enc` — read and write encrypted Parquet files using DataFusion
-//! - `parquet_exec_visitor` — extract statistics by visiting an ExecutionPlan after execution
-//! - `parquet_idx` — create an secondary index over several parquet files and use it to speed up queries
-//! - `query_http_csv` — configure `object_store` and run a query against files via HTTP
-//! - `remote_catalog` — interfacing with a remote catalog (e.g. over a network)
+//!
+//! - `catalog`
+//! (file: catalog.rs, desc: Register tables into a custom catalog)
+//!
+//! - `json_shredding`
+//! (file: json_shredding.rs, desc: Implement filter rewriting for JSON shredding)
+//!
+//! - `parquet_adv_idx`
+//! (file: parquet_advanced_index.rs, desc: Create a secondary index across multiple parquet files)
+//!
+//! - `parquet_emb_idx`
+//! (file: parquet_embedded_index.rs, desc: Store a custom index inside Parquet files)
+//!
+//! - `parquet_enc`
+//! (file: parquet_encrypted.rs, desc: Read & write encrypted Parquet files)
+//!
+//! - `parquet_enc_with_kms`
+//! (file: parquet_encrypted_with_kms.rs, desc: Encrypted Parquet I/O using a KMS-backed factory)
+//!
+//! - `parquet_exec_visitor`
+//! (file: parquet_exec_visitor.rs, desc: Extract statistics by visiting an ExecutionPlan)
+//!
+//! - `parquet_idx`
+//! (file: parquet_index.rs, desc: Create a secondary index)
+//!
+//! - `query_http_csv`
+//! (file: query_http_csv.rs, desc: Query CSV files via HTTP)
+//!
+//! - `remote_catalog`
+//! (file: remote_catalog.rs, desc: Interact with a remote catalog)

 mod catalog;
 mod json_shredding;

datafusion-examples/examples/dataframe/main.rs

Lines changed: 9 additions & 2 deletions
@@ -26,8 +26,15 @@
 //!
 //! Each subcommand runs a corresponding example:
 //! - `all` — run all examples included in this module
-//! - `dataframe` — run a query using a DataFrame API against parquet files, csv files, and in-memory data, including multiple subqueries
-//! - `deserialize_to_struct` — convert query results (Arrow ArrayRefs) into Rust structs
+//!
+//! - `cache_factory`
+//! (file: cache_factory.rs, desc: Custom lazy caching for DataFrames using `CacheFactory`)
+//
+//! - `dataframe`
+//! (file: dataframe.rs, desc: Query DataFrames from various sources and write output)
+//!
+//! - `deserialize_to_struct`
+//! (file: deserialize_to_struct.rs, desc: Convert Arrow arrays into Rust structs)

 mod cache_factory;
 mod dataframe;
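`deserialize_to_struct` is described as converting Arrow arrays into Rust structs. Independent of that example's exact code, the core move is downcasting each column of a `RecordBatch` to its concrete array type and copying values out row by row; a minimal sketch using DataFusion's arrow re-exports (the `User` struct and column layout are made up for illustration):

    // Illustrative only: pull typed values out of Arrow arrays into a struct.
    use std::sync::Arc;
    use datafusion::arrow::array::{ArrayRef, Int64Array, StringArray};
    use datafusion::arrow::datatypes::{DataType, Field, Schema};
    use datafusion::arrow::record_batch::RecordBatch;

    #[derive(Debug)]
    struct User {
        id: i64,
        name: String,
    }

    fn main() {
        let schema = Arc::new(Schema::new(vec![
            Field::new("id", DataType::Int64, false),
            Field::new("name", DataType::Utf8, false),
        ]));
        let batch = RecordBatch::try_new(
            schema,
            vec![
                Arc::new(Int64Array::from(vec![1_i64, 2_i64])) as ArrayRef,
                Arc::new(StringArray::from(vec!["alice", "bob"])),
            ],
        )
        .unwrap();

        // Downcast each column to its concrete array type, then copy rows out.
        let ids = batch.column(0).as_any().downcast_ref::<Int64Array>().unwrap();
        let names = batch.column(1).as_any().downcast_ref::<StringArray>().unwrap();
        let users: Vec<User> = (0..batch.num_rows())
            .map(|i| User { id: ids.value(i), name: names.value(i).to_string() })
            .collect();
        println!("{users:?}");
    }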

datafusion-examples/examples/execution_monitoring/main.rs

Lines changed: 9 additions & 3 deletions
@@ -26,9 +26,15 @@
 //!
 //! Each subcommand runs a corresponding example:
 //! - `all` — run all examples included in this module
-//! - `mem_pool_exec_plan` — shows how to implement memory-aware ExecutionPlan with memory reservation and spilling
-//! - `mem_pool_tracking` — demonstrates TrackConsumersPool for memory tracking and debugging with enhanced error messages
-//! - `tracing` — demonstrates the tracing injection feature for the DataFusion runtime
+//!
+//! - `mem_pool_exec_plan`
+//! (file: memory_pool_execution_plan.rs, desc: Memory-aware ExecutionPlan with spilling)
+//!
+//! - `mem_pool_tracking`
+//! (file: memory_pool_tracking.rs, desc: Demonstrates memory tracking)
+//!
+//! - `tracing`
+//! (file: tracing.rs, desc: Demonstrates tracing integration)

 mod memory_pool_execution_plan;
 mod memory_pool_tracking;
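`mem_pool_tracking` is listed as a demonstration of memory tracking with `TrackConsumersPool`. Setting that example aside, DataFusion's memory-pool API revolves around registering a named `MemoryConsumer` against a pool and growing or shrinking a reservation; a rough sketch follows (the pool limit and consumer name are arbitrary, and the committed example additionally wraps the pool in `TrackConsumersPool` for per-consumer error attribution):

    // Rough sketch of DataFusion's memory-pool reservation flow (illustrative).
    use std::sync::Arc;
    use datafusion::execution::memory_pool::{GreedyMemoryPool, MemoryConsumer, MemoryPool};

    fn main() -> datafusion::error::Result<()> {
        // A pool with an arbitrary 1 MiB limit.
        let pool: Arc<dyn MemoryPool> = Arc::new(GreedyMemoryPool::new(1024 * 1024));

        // Operators register a named consumer and reserve memory through it.
        let mut reservation = MemoryConsumer::new("illustrative_operator").register(&pool);
        reservation.try_grow(256 * 1024)?;
        println!("reserved {} bytes", reservation.size());

        // Dropping the reservation returns the memory to the pool.
        drop(reservation);
        Ok(())
    }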

datafusion-examples/examples/external_dependency/main.rs

Lines changed: 6 additions & 2 deletions
@@ -26,8 +26,12 @@
 //!
 //! Each subcommand runs a corresponding example:
 //! - `all` — run all examples included in this module
-//! - `dataframe_to_s3` — run a query using a DataFrame against a parquet file from AWS S3 and writing back to AWS S3
-//! - `query_aws_s3` — configure `object_store` and run a query against files stored in AWS S3
+//!
+//! - `dataframe_to_s3`
+//! (file: dataframe_to_s3.rs, desc: Query DataFrames and write results to S3)
+//!
+//! - `query_aws_s3`
+//! (file: query_aws_s3.rs, desc: Query S3-backed data using object_store)

 mod dataframe_to_s3;
 mod query_aws_s3;
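Both subcommands in this group revolve around registering an S3-backed `object_store` with a `SessionContext` and then querying `s3://` paths. A compressed sketch of that registration step (the bucket name is hypothetical, credentials are assumed to come from the usual AWS environment variables, and the `object_store`, `url`, and tokio-macros dependencies are assumed to be available as they are in the examples crate):

    // Rough sketch of registering an S3 object store with DataFusion
    // (illustrative only; see dataframe_to_s3.rs / query_aws_s3.rs for the
    // full examples, including the actual queries).
    use std::sync::Arc;
    use datafusion::error::Result;
    use datafusion::prelude::*;
    use object_store::aws::AmazonS3Builder;
    use url::Url;

    #[tokio::main]
    async fn main() -> Result<()> {
        let ctx = SessionContext::new();

        let bucket = "my-example-bucket"; // hypothetical bucket name
        let s3 = AmazonS3Builder::from_env().with_bucket_name(bucket).build()?;

        // Make s3://my-example-bucket/... paths resolvable from SQL and DataFrames.
        let url = Url::parse(&format!("s3://{bucket}")).expect("valid URL");
        let _ = ctx.register_object_store(&url, Arc::new(s3));

        // e.g. ctx.register_csv("data", &format!("s3://{bucket}/data.csv"),
        //                       CsvReadOptions::new()).await?;
        Ok(())
    }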
