
Commit 2b7d4f9

kumarUjjawal, goldmedal, adriangb, and martin-g authored
feat: create a datafusion-example for in-memory file format (apache#20394)
## Which issue does this PR close?

- Should close apache#18802 if we go this route.

## Rationale for this change

As mentioned in the original issue, there have been a few questions in discussions about reading CSV/JSON/Parquet files from memory instead of from disk or an object store. The proposal was either to add support for this or to create an example showing how to do it. I picked the latter.

## What changes are included in this PR?

- Create an example for this

## Are these changes tested?

Yes

## Are there any user-facing changes?

No

---------

Co-authored-by: Jax Liu <liugs963@gmail.com>
Co-authored-by: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com>
Co-authored-by: Martin Grigorov <martin-g@users.noreply.github.com>
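The rationale above is about reading CSV/JSON/Parquet bytes held in memory rather than on disk. As a concept-only sketch (the `ToyStore` type and its methods are hypothetical, not the real `object_store` crate API), the heart of an in-memory object store is a path-to-bytes map behind put/get operations:

```rust
use std::collections::HashMap;

/// Toy stand-in for an in-memory object store: a map from a path
/// string to the raw bytes of the "file" stored under it.
/// Hypothetical names throughout; NOT the `object_store` crate API.
struct ToyStore {
    objects: HashMap<String, Vec<u8>>,
}

impl ToyStore {
    fn new() -> Self {
        Self {
            objects: HashMap::new(),
        }
    }

    /// "Upload" bytes under a path, like `ObjectStore::put`.
    fn put(&mut self, path: &str, data: &[u8]) {
        self.objects.insert(path.to_string(), data.to_vec());
    }

    /// "Download" the bytes back, as a query engine would when
    /// scanning the file during a read.
    fn get(&self, path: &str) -> Option<&[u8]> {
        self.objects.get(path).map(|v| v.as_slice())
    }
}

fn main() {
    let mut store = ToyStore::new();
    store.put("/people.csv", b"id,name\n1,Alice\n2,Bob\n");
    let bytes = store.get("/people.csv").expect("object present");
    println!("stored {} bytes", bytes.len());
}
```

The committed example uses `object_store::memory::InMemory`, which implements the full `ObjectStore` trait over essentially this structure, so DataFusion can scan it like any other store.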
1 parent abf8f61 commit 2b7d4f9

File tree

3 files changed: +103, -13 lines

datafusion-examples/README.md

Lines changed: 13 additions & 12 deletions
@@ -88,18 +88,19 @@ cargo run --example dataframe -- dataframe
 
 #### Category: Single Process
 
-| Subcommand           | File Path                                                                                  | Description                                            |
-| -------------------- | ------------------------------------------------------------------------------------------ | ------------------------------------------------------ |
-| catalog              | [`data_io/catalog.rs`](examples/data_io/catalog.rs)                                        | Register tables into a custom catalog                  |
-| json_shredding       | [`data_io/json_shredding.rs`](examples/data_io/json_shredding.rs)                          | Implement filter rewriting for JSON shredding          |
-| parquet_adv_idx      | [`data_io/parquet_advanced_index.rs`](examples/data_io/parquet_advanced_index.rs)          | Create a secondary index across multiple parquet files |
-| parquet_emb_idx      | [`data_io/parquet_embedded_index.rs`](examples/data_io/parquet_embedded_index.rs)          | Store a custom index inside Parquet files              |
-| parquet_enc          | [`data_io/parquet_encrypted.rs`](examples/data_io/parquet_encrypted.rs)                    | Read & write encrypted Parquet files                   |
-| parquet_enc_with_kms | [`data_io/parquet_encrypted_with_kms.rs`](examples/data_io/parquet_encrypted_with_kms.rs)  | Encrypted Parquet I/O using a KMS-backed factory       |
-| parquet_exec_visitor | [`data_io/parquet_exec_visitor.rs`](examples/data_io/parquet_exec_visitor.rs)              | Extract statistics by visiting an ExecutionPlan        |
-| parquet_idx          | [`data_io/parquet_index.rs`](examples/data_io/parquet_index.rs)                            | Create a secondary index                               |
-| query_http_csv       | [`data_io/query_http_csv.rs`](examples/data_io/query_http_csv.rs)                          | Query CSV files via HTTP                               |
-| remote_catalog       | [`data_io/remote_catalog.rs`](examples/data_io/remote_catalog.rs)                          | Interact with a remote catalog                         |
+| Subcommand             | File Path                                                                                  | Description                                                               |
+| ---------------------- | ------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------- |
+| catalog                | [`data_io/catalog.rs`](examples/data_io/catalog.rs)                                        | Register tables into a custom catalog                                     |
+| in_memory_object_store | [`data_io/in_memory_object_store.rs`](examples/data_io/in_memory_object_store.rs)          | Read CSV from an in-memory object store (pattern applies to JSON/Parquet) |
+| json_shredding         | [`data_io/json_shredding.rs`](examples/data_io/json_shredding.rs)                          | Implement filter rewriting for JSON shredding                             |
+| parquet_adv_idx        | [`data_io/parquet_advanced_index.rs`](examples/data_io/parquet_advanced_index.rs)          | Create a secondary index across multiple parquet files                    |
+| parquet_emb_idx        | [`data_io/parquet_embedded_index.rs`](examples/data_io/parquet_embedded_index.rs)          | Store a custom index inside Parquet files                                 |
+| parquet_enc            | [`data_io/parquet_encrypted.rs`](examples/data_io/parquet_encrypted.rs)                    | Read & write encrypted Parquet files                                      |
+| parquet_enc_with_kms   | [`data_io/parquet_encrypted_with_kms.rs`](examples/data_io/parquet_encrypted_with_kms.rs)  | Encrypted Parquet I/O using a KMS-backed factory                          |
+| parquet_exec_visitor   | [`data_io/parquet_exec_visitor.rs`](examples/data_io/parquet_exec_visitor.rs)              | Extract statistics by visiting an ExecutionPlan                           |
+| parquet_idx            | [`data_io/parquet_index.rs`](examples/data_io/parquet_index.rs)                            | Create a secondary index                                                  |
+| query_http_csv         | [`data_io/query_http_csv.rs`](examples/data_io/query_http_csv.rs)                          | Query CSV files via HTTP                                                  |
+| remote_catalog         | [`data_io/remote_catalog.rs`](examples/data_io/remote_catalog.rs)                          | Interact with a remote catalog                                            |
 
 ## DataFrame Examples

datafusion-examples/examples/data_io/in_memory_object_store.rs

Lines changed: 81 additions & 0 deletions
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! See `main.rs` for how to run it.
//!
//! This follows the recommended approach: implement the `ObjectStore` trait
//! (or use an existing implementation), register it with DataFusion, and then
//! read a URL "path" from that store.
//! See the in-memory reference implementation:
//! https://docs.rs/object_store/latest/object_store/memory/struct.InMemory.html

use std::sync::Arc;

use arrow::datatypes::{DataType, Field, Schema};
use datafusion::assert_batches_eq;
use datafusion::common::Result;
use datafusion::execution::object_store::ObjectStoreUrl;
use datafusion::prelude::{CsvReadOptions, SessionContext};
use object_store::memory::InMemory;
use object_store::path::Path;
use object_store::{ObjectStore, ObjectStoreExt, PutPayload};

/// Demonstrates reading CSV data from an in-memory object store.
///
/// The same pattern applies to JSON/Parquet: register a store for a URL
/// prefix, write bytes into the store, then read via that URL.
pub async fn in_memory_object_store() -> Result<()> {
    let store: Arc<dyn ObjectStore> = Arc::new(InMemory::new());
    let ctx = SessionContext::new();
    let object_store_url = ObjectStoreUrl::parse("memory://")?;
    // Register a URL prefix to route reads through this object store.
    ctx.register_object_store(object_store_url.as_ref(), Arc::clone(&store));

    let schema = Schema::new(vec![
        Field::new("id", DataType::Int64, false),
        Field::new("name", DataType::Utf8, false),
    ]);

    println!("=== CSV from memory ===");
    let csv_path = Path::from("/people.csv");
    let csv_data = b"id,name\n1,Alice\n2,Bob\n";
    // Write bytes into the in-memory object store.
    store
        .put(&csv_path, PutPayload::from_static(csv_data))
        .await?;
    // Read using the URL that matches the registered prefix.
    let csv = ctx
        .read_csv(
            "memory:///people.csv",
            CsvReadOptions::new().schema(&schema),
        )
        .await?
        .collect()
        .await?;
    #[rustfmt::skip]
    let expected = [
        "+----+-------+",
        "| id | name  |",
        "+----+-------+",
        "| 1  | Alice |",
        "| 2  | Bob   |",
        "+----+-------+",
    ];
    assert_batches_eq!(expected, &csv);

    Ok(())
}
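The example's doc comment notes that the same pattern applies to JSON and Parquet. As a hedged sketch only (the `/people.json` object, its contents, and the default read options are illustrative assumptions, not part of this commit), the JSON variant inside the same function could look like:

```rust
// Illustrative fragment: reuses `store` and `ctx` from the function above,
// and assumes `use datafusion::prelude::NdJsonReadOptions;` has been added.
let json_path = Path::from("/people.json");
let json_data = b"{\"id\":1,\"name\":\"Alice\"}\n{\"id\":2,\"name\":\"Bob\"}\n";
// Write newline-delimited JSON bytes into the same in-memory store.
store
    .put(&json_path, PutPayload::from_static(json_data))
    .await?;
// Read it back through the registered `memory://` prefix.
let json = ctx
    .read_json("memory:///people.json", NdJsonReadOptions::default())
    .await?
    .collect()
    .await?;
```

The only thing that changes per format is the read call and its options; the store registration and the put of raw bytes are identical.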

datafusion-examples/examples/data_io/main.rs

Lines changed: 9 additions & 1 deletion
@@ -21,7 +21,7 @@
 //!
 //! ## Usage
 //! ```bash
-//! cargo run --example data_io -- [all|catalog|json_shredding|parquet_adv_idx|parquet_emb_idx|parquet_enc_with_kms|parquet_enc|parquet_exec_visitor|parquet_idx|query_http_csv|remote_catalog]
+//! cargo run --example data_io -- [all|catalog|in_memory_object_store|json_shredding|parquet_adv_idx|parquet_emb_idx|parquet_enc_with_kms|parquet_enc|parquet_exec_visitor|parquet_idx|query_http_csv|remote_catalog]
 //! ```
 //!
 //! Each subcommand runs a corresponding example:
@@ -30,6 +30,9 @@
 //! - `catalog`
 //!   (file: catalog.rs, desc: Register tables into a custom catalog)
 //!
+//! - `in_memory_object_store`
+//!   (file: in_memory_object_store.rs, desc: Read CSV from an in-memory object store (pattern applies to JSON/Parquet))
+//!
 //! - `json_shredding`
 //!   (file: json_shredding.rs, desc: Implement filter rewriting for JSON shredding)
 //!
@@ -58,6 +61,7 @@
 //!   (file: remote_catalog.rs, desc: Interact with a remote catalog)
 
 mod catalog;
+mod in_memory_object_store;
 mod json_shredding;
 mod parquet_advanced_index;
 mod parquet_embedded_index;
@@ -77,6 +81,7 @@ use strum_macros::{Display, EnumIter, EnumString, VariantNames};
 enum ExampleKind {
     All,
     Catalog,
+    InMemoryObjectStore,
     JsonShredding,
     ParquetAdvIdx,
     ParquetEmbIdx,
@@ -104,6 +109,9 @@ impl ExampleKind {
                 }
             }
             ExampleKind::Catalog => catalog::catalog().await?,
+            ExampleKind::InMemoryObjectStore => {
+                in_memory_object_store::in_memory_object_store().await?
+            }
             ExampleKind::JsonShredding => json_shredding::json_shredding().await?,
             ExampleKind::ParquetAdvIdx => {
                 parquet_advanced_index::parquet_advanced_index().await?
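In the diff above, `main.rs` turns the subcommand string into an `ExampleKind` variant (derived via strum in the real code) and dispatches through a match. A stdlib-only sketch of that parse-then-dispatch flow, with a hand-written `FromStr` standing in for strum's `EnumString` derive and only two variants shown (names and descriptions abbreviated):

```rust
use std::str::FromStr;

/// Minimal stand-in for the example's `ExampleKind` enum.
#[derive(Debug, PartialEq)]
enum ExampleKind {
    Catalog,
    InMemoryObjectStore,
}

impl FromStr for ExampleKind {
    type Err = String;

    /// Map a CLI subcommand string to a variant, as strum's
    /// `EnumString` derive does in the real example.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "catalog" => Ok(ExampleKind::Catalog),
            "in_memory_object_store" => Ok(ExampleKind::InMemoryObjectStore),
            other => Err(format!("unknown subcommand: {other}")),
        }
    }
}

/// Dispatch on the variant, mirroring the `match` in `main.rs`
/// (the real arms call async example functions instead).
fn describe(kind: &ExampleKind) -> &'static str {
    match kind {
        ExampleKind::Catalog => "Register tables into a custom catalog",
        ExampleKind::InMemoryObjectStore => "Read CSV from an in-memory object store",
    }
}

fn main() {
    let kind: ExampleKind = "in_memory_object_store".parse().expect("known subcommand");
    println!("{}", describe(&kind));
}
```

Adding a new example then touches exactly the spots the diff shows: one parse arm (one enum variant in the real code) and one dispatch arm.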

0 commit comments
