From da3f143181892758d87050ca8c9ae07d580be621 Mon Sep 17 00:00:00 2001 From: Alistair Israel Date: Sat, 28 Feb 2026 20:28:33 -0500 Subject: [PATCH 01/11] Update tail feature tests for output clarity and add first line validation in Rust tests --- features/tail.feature | 42 +++++++++++++++++++++--------------------- tests/features.rs | 11 +++++++++++ 2 files changed, 32 insertions(+), 21 deletions(-) diff --git a/features/tail.feature b/features/tail.feature index 7c579e8..bbb826a 100644 --- a/features/tail.feature +++ b/features/tail.feature @@ -4,67 +4,67 @@ Feature: Tail Scenario: Tail Parquet default (10 lines) When I run `datu tail fixtures/table.parquet` Then the command should succeed - And the first line should contain "one" + And the first line of the output should contain "one" Scenario: Tail Parquet with -n 2 When I run `datu tail fixtures/table.parquet -n 2` Then the command should succeed - And the first line should contain "one" - And the first line should contain "two" + And the first line of the output should contain "one" + And the first line of the output should contain "two" Scenario: Tail Avro default (10 lines) When I run `datu tail fixtures/userdata5.avro` Then the command should succeed - And the first line should contain "id" - And the first line should contain "first_name" + And the first line of the output should contain "id" + And the first line of the output should contain "first_name" Scenario: Tail Avro with -n 3 When I run `datu tail fixtures/userdata5.avro -n 3` Then the command should succeed - And the first line should contain "email" + And the first line of the output should contain "email" Scenario: Tail Parquet with --select When I run `datu tail fixtures/table.parquet -n 2 --select two,four` Then the command should succeed - And the first line should contain "two" - And the first line should contain "four" + And the first line of the output should contain "two" + And the first line of the output should contain "four" Scenario: 
Tail Avro with --select When I run `datu tail fixtures/userdata5.avro -n 2 --select id,email` Then the command should succeed - And the first line should contain "id" - And the first line should contain "email" + And the first line of the output should contain "id" + And the first line of the output should contain "email" Scenario: Tail ORC default (10 lines) When I run `datu convert fixtures/userdata5.avro $TEMPDIR/userdata5.orc --select id,first_name --limit 10` Then the command should succeed When I run `datu tail $TEMPDIR/userdata5.orc` Then the command should succeed - And the first line should contain "id" - And the first line should contain "first_name" + And the first line of the output should contain "id" + And the first line of the output should contain "first_name" Scenario: Tail ORC with -n 3 When I run `datu convert fixtures/userdata5.avro $TEMPDIR/userdata5.orc --select id,first_name --limit 10` Then the command should succeed When I run `datu tail $TEMPDIR/userdata5.orc -n 3` Then the command should succeed - And the first line should contain "id" + And the first line of the output should contain "id" Scenario: Tail ORC with --select When I run `datu convert fixtures/userdata5.avro $TEMPDIR/userdata5.orc --select id,first_name --limit 10` Then the command should succeed When I run `datu tail $TEMPDIR/userdata5.orc -n 2 --select id,first_name` Then the command should succeed - And the first line should contain "id" - And the first line should contain "first_name" + And the first line of the output should contain "id" + And the first line of the output should contain "first_name" Scenario: Tail ORC with --output csv When I run `datu convert fixtures/userdata5.avro $TEMPDIR/userdata5.orc --select id,first_name --limit 10` Then the command should succeed When I run `datu tail $TEMPDIR/userdata5.orc -n 2 --output csv` Then the command should succeed - And the first line should contain "id" - And the first line should contain "first_name" + And the first 
line of the output should contain "id" + And the first line of the output should contain "first_name" Scenario: Tail ORC with --output json When I run `datu convert fixtures/userdata5.avro $TEMPDIR/userdata5.orc --select id,first_name --limit 10` @@ -87,8 +87,8 @@ Feature: Tail Scenario: Tail Parquet with --output csv When I run `datu tail fixtures/table.parquet -n 2 --output csv` Then the command should succeed - And the first line should contain "one" - And the first line should contain "two" + And the first line of the output should contain "one" + And the first line of the output should contain "two" Scenario: Tail Parquet with --output json When I run `datu tail fixtures/table.parquet -n 2 --output json` @@ -100,8 +100,8 @@ Feature: Tail Scenario: Tail Avro with --output csv When I run `datu tail fixtures/userdata5.avro -n 2 --output csv` Then the command should succeed - And the first line should contain "id" - And the first line should contain "first_name" + And the first line of the output should contain "id" + And the first line of the output should contain "first_name" Scenario: Tail Avro with --output json When I run `datu tail fixtures/userdata5.avro -n 2 --output json` diff --git a/tests/features.rs b/tests/features.rs index 445fd3e..e2ce3eb 100644 --- a/tests/features.rs +++ b/tests/features.rs @@ -87,6 +87,17 @@ fn first_line_should_be(world: &mut CliWorld, expected: String) { ); } +#[then(regex = r#"^the first line (?:of the output )?should contain "(.+)"$"#)] +fn first_line_should_contain(world: &mut CliWorld, expected: String) { + let output = world.output.as_ref().expect("No output captured"); + let stdout = String::from_utf8_lossy(&output.stdout); + let first_line = stdout.lines().next().unwrap_or(""); + assert!( + first_line.contains(&expected), + "Expected first line to contain '{expected}', but got: {first_line}" + ); +} + #[then(regex = r#"^the output should be:$"#)] fn output_should_be_docstring(world: &mut CliWorld, step: &Step) { let 
expected = step From 9dff139671aa50ecd827a3cc316f640f3cff62bd Mon Sep 17 00:00:00 2001 From: Alistair Israel Date: Sat, 28 Feb 2026 20:29:48 -0500 Subject: [PATCH 02/11] Move cli features to `features/cli` --- features/{ => cli}/cli.feature | 0 features/{ => cli}/convert.feature | 0 features/{ => cli}/count.feature | 0 features/{ => cli}/head.feature | 0 features/{ => cli}/schema.feature | 0 features/{ => cli}/tail.feature | 0 tests/features.rs | 2 +- 7 files changed, 1 insertion(+), 1 deletion(-) rename features/{ => cli}/cli.feature (100%) rename features/{ => cli}/convert.feature (100%) rename features/{ => cli}/count.feature (100%) rename features/{ => cli}/head.feature (100%) rename features/{ => cli}/schema.feature (100%) rename features/{ => cli}/tail.feature (100%) diff --git a/features/cli.feature b/features/cli/cli.feature similarity index 100% rename from features/cli.feature rename to features/cli/cli.feature diff --git a/features/convert.feature b/features/cli/convert.feature similarity index 100% rename from features/convert.feature rename to features/cli/convert.feature diff --git a/features/count.feature b/features/cli/count.feature similarity index 100% rename from features/count.feature rename to features/cli/count.feature diff --git a/features/head.feature b/features/cli/head.feature similarity index 100% rename from features/head.feature rename to features/cli/head.feature diff --git a/features/schema.feature b/features/cli/schema.feature similarity index 100% rename from features/schema.feature rename to features/cli/schema.feature diff --git a/features/tail.feature b/features/cli/tail.feature similarity index 100% rename from features/tail.feature rename to features/cli/tail.feature diff --git a/tests/features.rs b/tests/features.rs index e2ce3eb..b481e1d 100644 --- a/tests/features.rs +++ b/tests/features.rs @@ -378,5 +378,5 @@ fn that_file_should_have_n_lines(world: &mut CliWorld, n: usize) { } fn main() { - 
futures::executor::block_on(CliWorld::run("features")); + futures::executor::block_on(CliWorld::run("features/cli")); } From b5663ad684dfd065c1b02c0ee847f54eb2e5bfd5 Mon Sep 17 00:00:00 2001 From: Alistair Israel Date: Sat, 28 Feb 2026 20:56:31 -0500 Subject: [PATCH 03/11] Enhance REPL functionality and add feature tests - Introduced a new REPL feature with a corresponding test suite to validate its output. - Added `expectrl` as a dependency for handling REPL interactions. - Created a new `repl.feature` file to define the REPL behavior and expectations. - Implemented `repl.rs` to manage REPL sessions and validate output against expected results. - Updated `Cargo.toml` to include the new `expectrl` dependency and added a test configuration for the REPL. --- Cargo.lock | 149 ++++++++++++++++++++++++++++++++++--- Cargo.toml | 5 ++ features/repl/repl.feature | 8 ++ tests/repl.rs | 36 +++++++++ 4 files changed, 189 insertions(+), 9 deletions(-) create mode 100644 features/repl/repl.feature create mode 100644 tests/repl.rs diff --git a/Cargo.lock b/Cargo.lock index c85a093..b398939 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -483,6 +483,12 @@ dependencies = [ "serde", ] +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + [[package]] name = "bitflags" version = "2.11.0" @@ -784,6 +790,15 @@ version = "0.4.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75984efb6ed102a0d42db99afb6c1948f0380d1d91808d5529916e6c08b49d8d" +[[package]] +name = "conpty" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b72b06487a0d4683349ad74d62e87ad639b09667082b3c495c5b6bab7d84b3da" +dependencies = [ + "windows", +] + [[package]] name = "console" version = "0.16.2" @@ -1764,6 +1779,7 @@ dependencies = [ "csv", "cucumber", "datafusion", + "expectrl", "flt", "futures", 
"gherkin", @@ -1900,6 +1916,18 @@ version = "3.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dea2df4cf52843e0452895c455a1a2cfbb842a1e7329671acf418fdc53ed4c59" +[[package]] +name = "expectrl" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0e0706d01b4f43adaf7e0fb460e07477c36b74ae60fdeb1d045001bd77b4bd1" +dependencies = [ + "conpty", + "nix 0.26.4", + "ptyprocess", + "regex", +] + [[package]] name = "fallible-streaming-iterator" version = "0.1.9" @@ -1941,7 +1969,7 @@ version = "25.12.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "35f6839d7b3b98adde531effaf34f0c2badc6f4735d26fe74709d8e513a96ef3" dependencies = [ - "bitflags", + "bitflags 2.11.0", "rustc_version", ] @@ -2173,7 +2201,7 @@ version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0bf760ebf69878d9fd8f110c89703d90ce35095324d1f1edcb595c63945ee757" dependencies = [ - "bitflags", + "bitflags 2.11.0", "ignore", "walkdir", ] @@ -2718,6 +2746,15 @@ version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" +[[package]] +name = "memoffset" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4" +dependencies = [ + "autocfg", +] + [[package]] name = "minimal-lexical" version = "0.2.1" @@ -2743,13 +2780,26 @@ dependencies = [ "smallvec", ] +[[package]] +name = "nix" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "598beaf3cc6fdd9a5dfb1630c2800c7acd31df7aaf0f565796fba2b53ca1af1b" +dependencies = [ + "bitflags 1.3.2", + "cfg-if", + "libc", + "memoffset", + "pin-utils", +] + [[package]] name = "nix" version = "0.30.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"74523f3a35e05aba87a1d978330aef40f67b0304ac79c1c00b294c9830543db6" dependencies = [ - "bitflags", + "bitflags 2.11.0", "cfg-if", "cfg_aliases", "libc", @@ -3110,6 +3160,12 @@ version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + [[package]] name = "pkg-config" version = "0.3.32" @@ -3229,6 +3285,15 @@ dependencies = [ "cc", ] +[[package]] +name = "ptyprocess" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "101be273c0b1680d7056afddbaa88f02b6e9f2dc161165c30bee9914b6025a79" +dependencies = [ + "nix 0.26.4", +] + [[package]] name = "quad-rand" version = "0.2.3" @@ -3335,7 +3400,7 @@ version = "0.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" dependencies = [ - "bitflags", + "bitflags 2.11.0", ] [[package]] @@ -3424,7 +3489,7 @@ version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" dependencies = [ - "bitflags", + "bitflags 2.11.0", "errno", "libc", "linux-raw-sys", @@ -3443,7 +3508,7 @@ version = "17.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e902948a25149d50edc1a8e0141aad50f54e22ba83ff988cf8f7c9ef07f50564" dependencies = [ - "bitflags", + "bitflags 2.11.0", "cfg-if", "clipboard-win", "fd-lock", @@ -3451,7 +3516,7 @@ dependencies = [ "libc", "log", "memchr", - "nix", + "nix 0.30.1", "radix_trie", "unicode-segmentation", "unicode-width", @@ -4226,7 +4291,7 @@ version = "0.244.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" dependencies = [ - "bitflags", + "bitflags 2.11.0", "hashbrown 0.15.5", "indexmap", "semver", @@ -4261,6 +4326,15 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "windows" +version = "0.44.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e745dab35a0c4c77aa3ce42d595e13d2003d6902d6b08c9ef5fc326d08da12b" +dependencies = [ + "windows-targets 0.42.2", +] + [[package]] name = "windows-core" version = "0.62.2" @@ -4347,6 +4421,21 @@ dependencies = [ "windows-link", ] +[[package]] +name = "windows-targets" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" +dependencies = [ + "windows_aarch64_gnullvm 0.42.2", + "windows_aarch64_msvc 0.42.2", + "windows_i686_gnu 0.42.2", + "windows_i686_msvc 0.42.2", + "windows_x86_64_gnu 0.42.2", + "windows_x86_64_gnullvm 0.42.2", + "windows_x86_64_msvc 0.42.2", +] + [[package]] name = "windows-targets" version = "0.52.6" @@ -4380,6 +4469,12 @@ dependencies = [ "windows_x86_64_msvc 0.53.1", ] +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" + [[package]] name = "windows_aarch64_gnullvm" version = "0.52.6" @@ -4392,6 +4487,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" +[[package]] +name = "windows_aarch64_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" + [[package]] name = "windows_aarch64_msvc" version = "0.52.6" @@ -4404,6 +4505,12 @@ version = "0.53.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" +[[package]] +name = "windows_i686_gnu" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" + [[package]] name = "windows_i686_gnu" version = "0.52.6" @@ -4428,6 +4535,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" +[[package]] +name = "windows_i686_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" + [[package]] name = "windows_i686_msvc" version = "0.52.6" @@ -4440,6 +4553,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" +[[package]] +name = "windows_x86_64_gnu" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" + [[package]] name = "windows_x86_64_gnu" version = "0.52.6" @@ -4452,6 +4571,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" + [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" @@ -4464,6 +4589,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" +[[package]] +name = "windows_x86_64_msvc" +version 
= "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" + [[package]] name = "windows_x86_64_msvc" version = "0.52.6" @@ -4534,7 +4665,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" dependencies = [ "anyhow", - "bitflags", + "bitflags 2.11.0", "indexmap", "log", "serde", diff --git a/Cargo.toml b/Cargo.toml index 947ac46..b1743a3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -37,6 +37,7 @@ flt = { version = "0.0.2" } [dev-dependencies] criterion = "0.5" cucumber = "0.22.1" +expectrl = "0.8" serde_yaml = "0.9" gherkin = "0.15" tempfile = "3" @@ -48,3 +49,7 @@ harness = false [[test]] name = "features" harness = false + +[[test]] +name = "repl" +harness = false diff --git a/features/repl/repl.feature b/features/repl/repl.feature new file mode 100644 index 0000000..f44d675 --- /dev/null +++ b/features/repl/repl.feature @@ -0,0 +1,8 @@ +Feature: REPL + + Scenario: + When I run the REPL + Then the output should be: + ``` + > + ``` diff --git a/tests/repl.rs b/tests/repl.rs new file mode 100644 index 0000000..e499242 --- /dev/null +++ b/tests/repl.rs @@ -0,0 +1,36 @@ +use std::time::Duration; + +use cucumber::World; +use cucumber::then; +use cucumber::when; +use expectrl::Expect; +use expectrl::session::OsSession; +use gherkin::Step; + +#[derive(Debug, Default, World)] +pub struct ReplWorld { + session: Option, +} + +#[when(regex = r#"^I run the REPL$"#)] +fn run_datu_repl(world: &mut ReplWorld) { + let datu_path = std::env::var("CARGO_BIN_EXE_datu") + .expect("Environment variable 'CARGO_BIN_EXE_datu' not defined"); + let mut session = expectrl::spawn(datu_path).expect("Failed to spawn REPL"); + session.set_expect_timeout(Some(Duration::from_secs(5))); + world.session = Some(session); +} + +#[then(regex = r#"^the output should be:$"#)] +fn the_output_should_be(world: &mut 
ReplWorld, step: &Step) { + let expected = step.docstring.as_ref().expect("Step requires a docstring"); + let session = world.session.as_mut().expect("No session running"); + let expected_trimmed = expected.trim(); + session + .expect(expected_trimmed) + .unwrap_or_else(|e| panic!("Expected to find '{expected_trimmed}' in output: {e}")); +} + +fn main() { + futures::executor::block_on(ReplWorld::run("features/repl")); +} From 425b8c14c10af869906bcb0311d219ecf073556e Mon Sep 17 00:00:00 2001 From: Alistair Israel Date: Sun, 1 Mar 2026 19:57:16 -0500 Subject: [PATCH 04/11] Update CLI feature tests for version output and first line validation - Changed expected output for the version command from "datu 0.2.4" to "datu 0.3.0-alpha". - Updated the first line validation in the head feature tests to specify "the first line of the output should be" for clarity. - Adjusted regex patterns in Rust tests to match the new output expectations for first line assertions. --- features/cli/cli.feature | 2 +- features/cli/head.feature | 2 +- tests/features.rs | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/features/cli/cli.feature b/features/cli/cli.feature index 9ad91db..1c05dfe 100644 --- a/features/cli/cli.feature +++ b/features/cli/cli.feature @@ -2,7 +2,7 @@ Feature: CLI Scenario: Print version When I run `datu --version` - Then the output should contain "datu 0.2.4" + Then the first line of the output should be: datu 0.3.0-alpha Scenario: Print help with help subcommand When I run `datu help` diff --git a/features/cli/head.feature b/features/cli/head.feature index e21e873..17d65f4 100644 --- a/features/cli/head.feature +++ b/features/cli/head.feature @@ -101,7 +101,7 @@ Feature: Head When I run `datu head fixtures/userdata5.avro -n 2 --output csv` Then the command should succeed And the output should have a header and 2 lines - And the first line should be: 
registration_dttm,id,first_name,last_name,email,gender,ip_address,cc,country,birthdate,salary,title,comments + And the first line of the output should be: registration_dttm,id,first_name,last_name,email,gender,ip_address,cc,country,birthdate,salary,title,comments Scenario: Head Avro with --output json When I run `datu head fixtures/userdata5.avro -n 2 --output json` diff --git a/tests/features.rs b/tests/features.rs index b481e1d..d321ef4 100644 --- a/tests/features.rs +++ b/tests/features.rs @@ -74,7 +74,7 @@ fn command_should_succeed(world: &mut CliWorld) { ); } -#[then(regex = r#"^the first line should be: (.+)$"#)] +#[then(regex = r#"^the first line of the output should be: (.+)$"#)] fn first_line_should_be(world: &mut CliWorld, expected: String) { let output = world.output.as_ref().expect("No output captured"); let stdout = String::from_utf8_lossy(&output.stdout); @@ -87,7 +87,7 @@ fn first_line_should_be(world: &mut CliWorld, expected: String) { ); } -#[then(regex = r#"^the first line (?:of the output )?should contain "(.+)"$"#)] +#[then(regex = r#"^the first line of the output should contain "(.+)"$"#)] fn first_line_should_contain(world: &mut CliWorld, expected: String) { let output = world.output.as_ref().expect("No output captured"); let stdout = String::from_utf8_lossy(&output.stdout); From 36200b25851926f537ee2efe3e82c8a0e5a13ec7 Mon Sep 17 00:00:00 2001 From: Alistair Israel Date: Sun, 1 Mar 2026 19:57:28 -0500 Subject: [PATCH 05/11] Update datu version to 0.3.0-alpha in Cargo.toml and Cargo.lock --- Cargo.lock | 2 +- Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b398939..c89d2ce 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1767,7 +1767,7 @@ dependencies = [ [[package]] name = "datu" -version = "0.2.4" +version = "0.3.0-alpha" dependencies = [ "anyhow", "arrow", diff --git a/Cargo.toml b/Cargo.toml index b1743a3..cce938b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] 
name = "datu" -version = "0.2.4" +version = "0.3.0-alpha" edition = "2024" description = "datu - a data file utility" license = "MIT" From ca851200252ef72db0e864d49b7285f8a44c17fd Mon Sep 17 00:00:00 2001 From: Alistair Israel Date: Sun, 1 Mar 2026 21:17:17 -0500 Subject: [PATCH 06/11] Add conversion feature and implement count command in REPL - Introduced a new feature for converting data from Parquet to CSV format, with a scenario to validate the output file's existence and content. - Added a `Count` variant to the `PipelineStage` enum, enabling row counting across batches in the REPL. - Implemented the `exec_count` method to calculate and print the total number of rows. - Enhanced REPL tests to include scenarios for the new count functionality and conversion feature, ensuring robust validation of expected behaviors. --- features/repl/conversion.feature | 10 +++ features/repl/repl.feature | 2 +- src/cli/repl.rs | 114 ++++++++++++++++++++++++++++++ tests/repl.rs | 115 ++++++++++++++++++++++++++++++- 4 files changed, 239 insertions(+), 2 deletions(-) create mode 100644 features/repl/conversion.feature diff --git a/features/repl/conversion.feature b/features/repl/conversion.feature new file mode 100644 index 0000000..e7bf56a --- /dev/null +++ b/features/repl/conversion.feature @@ -0,0 +1,10 @@ +Feature: Conversion + + Scenario: + When the REPL is ran and the user types: + ``` + read("fixtures/userdata.parquet") |> write("$TEMPDIR/userdata.csv") + ``` + Then the file "$TEMPDIR/userdata.csv" should exist + And that file should be a CSV file + And the first line of that file should be: "registration_dttm,id,first_name,last_name,email,gender,ip_address,cc,country,birthdate,salary,title,comments" diff --git a/features/repl/repl.feature b/features/repl/repl.feature index f44d675..69d1e0e 100644 --- a/features/repl/repl.feature +++ b/features/repl/repl.feature @@ -1,7 +1,7 @@ Feature: REPL Scenario: - When I run the REPL + When datu is ran without a command Then the output 
should be: ``` > diff --git a/src/cli/repl.rs b/src/cli/repl.rs index b96d260..e3ab2a0 100644 --- a/src/cli/repl.rs +++ b/src/cli/repl.rs @@ -18,6 +18,7 @@ enum PipelineStage { Select { columns: Vec }, Head { n: usize }, Tail { n: usize }, + Count, Write { path: String }, Print, } @@ -97,6 +98,7 @@ impl ReplPipelineBuilder { PipelineStage::Select { columns } => self.exec_select(&columns).await, PipelineStage::Head { n } => self.exec_head(n), PipelineStage::Tail { n } => self.exec_tail(n), + PipelineStage::Count => self.exec_count(), PipelineStage::Write { path } => self.exec_write(&path).await, PipelineStage::Print => self.print_batches(), } @@ -172,6 +174,16 @@ impl ReplPipelineBuilder { Ok(()) } + /// Counts the total number of rows across all batches and prints the result. + fn exec_count(&mut self) -> crate::Result<()> { + let batches = self.batches.take().ok_or_else(|| { + Error::GenericError("count requires a preceding read in the pipe".to_string()) + })?; + let total: usize = batches.iter().map(|b| b.num_rows()).sum(); + println!("{total}"); + Ok(()) + } + /// Writes the batches in context to an output file. 
async fn exec_write(&mut self, output_path: &str) -> crate::Result<()> { let batches = self.batches.take().ok_or_else(|| { @@ -267,6 +279,10 @@ impl ReplPipelineBuilder { let path = extract_path_from_args("write", &args)?; self.exec_write(&path).await } + + fn eval_count(&mut self) -> crate::Result<()> { + self.exec_count() + } } #[cfg(test)] @@ -305,6 +321,14 @@ fn plan_stage(expr: Expr) -> crate::Result { let n = extract_tail_n(&args)?; Ok(PipelineStage::Tail { n }) } + "count" => { + if !args.is_empty() { + return Err(Error::UnsupportedFunctionCall( + "count takes no arguments".to_string(), + )); + } + Ok(PipelineStage::Count) + } "write" => { let path = extract_path_from_args("write", &args)?; Ok(PipelineStage::Write { path }) @@ -1177,6 +1201,96 @@ mod tests { assert!(ctx.batches.is_none(), "batches consumed by write"); } + // ── plan_stage: count ──────────────────────────────────────── + + #[test] + fn test_plan_stage_count() { + let expr = parse("count()"); + let stage = plan_stage(expr).unwrap(); + assert_eq!(stage, PipelineStage::Count); + } + + #[test] + fn test_plan_stage_count_rejects_args() { + let expr = Expr::FunctionCall( + Identifier("count".into()), + vec![Expr::Literal(Literal::String("extra".into()))], + ); + let result = plan_stage(expr); + assert!(result.is_err()); + assert!(matches!( + result.unwrap_err(), + Error::UnsupportedFunctionCall(_) + )); + } + + // ── plan_pipeline: count does not auto-append print ───────── + + #[test] + fn test_plan_pipeline_count_no_auto_print() { + let expr = parse(r#"read("a.parquet") |> count()"#); + let mut exprs = Vec::new(); + collect_pipe_stages(expr, &mut exprs); + let pipeline = plan_pipeline(exprs).unwrap(); + assert_eq!(pipeline.len(), 2); + assert_eq!( + pipeline[0], + PipelineStage::Read { + path: "a.parquet".to_string() + } + ); + assert_eq!(pipeline[1], PipelineStage::Count); + } + + // ── eval_count ───────────────────────────────────────────── + + #[tokio::test(flavor = "multi_thread")] + async 
fn test_eval_count_success() { + let mut ctx = new_context(); + ctx.eval_read(vec![Expr::Literal(Literal::String( + "fixtures/table.parquet".into(), + ))]) + .await + .expect("read"); + + ctx.eval_count().expect("count"); + assert!(ctx.batches.is_none(), "batches consumed by count"); + } + + #[test] + fn test_eval_count_no_preceding_read() { + let mut ctx = new_context(); + let result = ctx.eval_count(); + assert!(result.is_err()); + assert!(matches!(result.unwrap_err(), Error::GenericError(_))); + } + + // ── eval_count: pipeline integration ──────────────────────── + + #[tokio::test(flavor = "multi_thread")] + async fn test_repl_pipeline_read_count() { + let expr = parse(r#"read("fixtures/table.parquet") |> count()"#); + let mut ctx = new_context(); + ctx.eval(expr).await.expect("eval"); + assert!(ctx.batches.is_none(), "batches consumed by count"); + } + + #[tokio::test(flavor = "multi_thread")] + async fn test_repl_pipeline_read_select_count() { + let expr = parse(r#"read("fixtures/table.parquet") |> select(:one, :two) |> count()"#); + let mut ctx = new_context(); + ctx.eval(expr).await.expect("eval"); + assert!(ctx.batches.is_none(), "batches consumed by count"); + } + + #[tokio::test(flavor = "multi_thread")] + async fn test_repl_pipeline_read_head_count() { + let expr = parse(r#"read("fixtures/table.parquet") |> head(2) |> count()"#); + let mut ctx = new_context(); + ctx.eval(expr).await.expect("eval"); + assert!(ctx.batches.is_none(), "batches consumed by count"); + } + #[tokio::test(flavor = "multi_thread")] async fn test_repl_pipeline_read_select_tail_write() { let temp_dir = tempfile::tempdir().expect("tempdir"); diff --git a/tests/repl.rs b/tests/repl.rs index e499242..4d0eddc 100644 --- a/tests/repl.rs +++ b/tests/repl.rs @@ -1,3 +1,7 @@ +use std::io::BufRead; +use std::io::Write; +use std::path::Path; +use std::process::Stdio; use std::time::Duration; use cucumber::World; @@ -7,12 +11,46 @@ use expectrl::Expect; use expectrl::session::OsSession; use 
gherkin::Step; +const TEMPDIR_PLACEHOLDER: &str = "$TEMPDIR"; + #[derive(Debug, Default, World)] pub struct ReplWorld { session: Option, + temp_dir: Option, + last_file: Option, +} + +fn replace_tempdir(s: &str, temp_path: &str) -> String { + s.replace(TEMPDIR_PLACEHOLDER, temp_path) } -#[when(regex = r#"^I run the REPL$"#)] +fn resolve_path(world: &ReplWorld, path: &str) -> String { + if let Some(ref temp_dir) = world.temp_dir { + let temp_path = temp_dir + .path() + .to_str() + .expect("Temp path is not valid UTF-8"); + replace_tempdir(path, temp_path) + } else { + path.to_string() + } +} + +fn ensure_temp_dir(world: &mut ReplWorld) -> String { + if world.temp_dir.is_none() { + world.temp_dir = Some(tempfile::tempdir().expect("Failed to create temp dir")); + } + world + .temp_dir + .as_ref() + .unwrap() + .path() + .to_str() + .expect("Temp path is not valid UTF-8") + .to_string() +} + +#[when(regex = r#"^datu is ran without a command$"#)] fn run_datu_repl(world: &mut ReplWorld) { let datu_path = std::env::var("CARGO_BIN_EXE_datu") .expect("Environment variable 'CARGO_BIN_EXE_datu' not defined"); @@ -21,6 +59,39 @@ fn run_datu_repl(world: &mut ReplWorld) { world.session = Some(session); } +#[when(regex = r#"^the REPL is ran and the user types:$"#)] +fn repl_is_ran_and_user_types(world: &mut ReplWorld, step: &Step) { + let input = step.docstring.as_ref().expect("Step requires a docstring"); + let temp_path = ensure_temp_dir(world); + + let datu_path = std::env::var("CARGO_BIN_EXE_datu") + .expect("Environment variable 'CARGO_BIN_EXE_datu' not defined"); + + let mut child = std::process::Command::new(datu_path) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .expect("Failed to spawn datu REPL"); + + { + let stdin = child.stdin.as_mut().expect("Failed to open stdin"); + for line in input.trim().lines() { + let resolved = replace_tempdir(line.trim(), &temp_path); + writeln!(stdin, "{resolved}").expect("Failed to write to REPL 
stdin"); + } + } + drop(child.stdin.take()); + + let output = child.wait_with_output().expect("Failed to wait for datu"); + assert!( + output.status.success(), + "datu REPL exited with error.\nstdout: {}\nstderr: {}", + String::from_utf8_lossy(&output.stdout), + String::from_utf8_lossy(&output.stderr) + ); +} + #[then(regex = r#"^the output should be:$"#)] fn the_output_should_be(world: &mut ReplWorld, step: &Step) { let expected = step.docstring.as_ref().expect("Step requires a docstring"); @@ -31,6 +102,48 @@ fn the_output_should_be(world: &mut ReplWorld, step: &Step) { .unwrap_or_else(|e| panic!("Expected to find '{expected_trimmed}' in output: {e}")); } +#[then(regex = r#"^the file "(.+)" should exist$"#)] +fn file_should_exist(world: &mut ReplWorld, path: String) { + let path_resolved = resolve_path(world, &path); + assert!( + Path::new(&path_resolved).exists(), + "Expected file to exist: {path_resolved}" + ); + world.last_file = Some(path_resolved); +} + +#[then(regex = r#"^that file should be a CSV file$"#)] +fn that_file_should_be_csv(world: &mut ReplWorld) { + let path = world + .last_file + .as_ref() + .expect("No file has been set; use 'the file \"...\" should exist' first"); + let mut reader = csv::Reader::from_path(path).expect("Failed to open file as CSV"); + let headers = reader.headers().expect("Failed to read CSV headers"); + assert!( + !headers.is_empty(), + "Expected CSV file to have headers, but got none" + ); +} + +#[then(regex = r#"^the first line of that file should be: "(.+)"$"#)] +fn first_line_of_file_should_be(world: &mut ReplWorld, expected: String) { + let path = world + .last_file + .as_ref() + .expect("No file has been set; use 'the file \"...\" should exist' first"); + let file = std::fs::File::open(path).expect("Failed to open file"); + let first_line = std::io::BufReader::new(file) + .lines() + .next() + .expect("File is empty") + .expect("Failed to read line"); + assert!( + first_line == expected, + "Expected first line to be 
'{expected}', but got: {first_line}" + ); +} + fn main() { futures::executor::block_on(ReplWorld::run("features/repl")); } From 212cbf93dfa8020f4329c9ae1b990dc81ff2e869 Mon Sep 17 00:00:00 2001 From: Alistair Israel Date: Sun, 1 Mar 2026 21:37:31 -0500 Subject: [PATCH 07/11] Add comprehensive conversion scenarios and validation in REPL tests - Expanded the conversion feature in the REPL by adding multiple scenarios for converting data between various formats including Parquet, Avro, and ORC to CSV, JSON, YAML, and XLSX. - Implemented new validation steps in the tests to ensure output files exist, are valid in their respective formats, and contain expected content. - Enhanced the REPL test suite with additional assertions for line counts and content validation, improving overall test coverage and reliability. --- features/repl/conversion.feature | 180 ++++++++++++++++++++++++++++++- tests/repl.rs | 70 ++++++++++++ 2 files changed, 249 insertions(+), 1 deletion(-) diff --git a/features/repl/conversion.feature b/features/repl/conversion.feature index e7bf56a..62dc1fb 100644 --- a/features/repl/conversion.feature +++ b/features/repl/conversion.feature @@ -1,6 +1,6 @@ Feature: Conversion - Scenario: + Scenario: Parquet to CSV When the REPL is ran and the user types: ``` read("fixtures/userdata.parquet") |> write("$TEMPDIR/userdata.csv") @@ -8,3 +8,181 @@ Feature: Conversion Then the file "$TEMPDIR/userdata.csv" should exist And that file should be a CSV file And the first line of that file should be: "registration_dttm,id,first_name,last_name,email,gender,ip_address,cc,country,birthdate,salary,title,comments" + + Scenario: Parquet to JSON + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> write("$TEMPDIR/table.json") + ``` + Then the file "$TEMPDIR/table.json" should exist + And that file should be valid JSON + And that file should contain "two" + And that file should contain "foo" + + Scenario: Parquet to YAML + When the REPL is ran 
and the user types: + ``` + read("fixtures/table.parquet") |> write("$TEMPDIR/table.yaml") + ``` + Then the file "$TEMPDIR/table.yaml" should exist + And that file should be valid YAML + And that file should contain "two:" + And that file should contain "foo" + + Scenario: Parquet to Avro + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> write("$TEMPDIR/table.avro") + ``` + Then the file "$TEMPDIR/table.avro" should exist + + Scenario: Parquet to XLSX + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> write("$TEMPDIR/table.xlsx") + ``` + Then the file "$TEMPDIR/table.xlsx" should exist + + Scenario: Avro to CSV + When the REPL is ran and the user types: + ``` + read("fixtures/userdata5.avro") |> write("$TEMPDIR/userdata5.csv") + ``` + Then the file "$TEMPDIR/userdata5.csv" should exist + And that file should be a CSV file + And the first line of that file should contain "id" + And the first line of that file should contain "first_name" + + Scenario: Avro to JSON + When the REPL is ran and the user types: + ``` + read("fixtures/userdata5.avro") |> write("$TEMPDIR/userdata5.json") + ``` + Then the file "$TEMPDIR/userdata5.json" should exist + And that file should be valid JSON + + Scenario: Avro to YAML + When the REPL is ran and the user types: + ``` + read("fixtures/userdata5.avro") |> write("$TEMPDIR/userdata5.yaml") + ``` + Then the file "$TEMPDIR/userdata5.yaml" should exist + And that file should be valid YAML + And that file should contain "id:" + And that file should contain "first_name:" + + Scenario: Avro to Parquet + When the REPL is ran and the user types: + ``` + read("fixtures/userdata5.avro") |> write("$TEMPDIR/userdata5.parquet") + ``` + Then the file "$TEMPDIR/userdata5.parquet" should exist + + Scenario: Avro to ORC + When the REPL is ran and the user types: + ``` + read("fixtures/userdata5.avro") |> write("$TEMPDIR/userdata5.orc") + ``` + Then the file "$TEMPDIR/userdata5.orc" 
should exist + + Scenario: Avro to XLSX + When the REPL is ran and the user types: + ``` + read("fixtures/userdata5.avro") |> write("$TEMPDIR/userdata5.xlsx") + ``` + Then the file "$TEMPDIR/userdata5.xlsx" should exist + + Scenario: ORC to CSV + When the REPL is ran and the user types: + ``` + read("fixtures/userdata.orc") |> write("$TEMPDIR/userdata_orc.csv") + ``` + Then the file "$TEMPDIR/userdata_orc.csv" should exist + And that file should be a CSV file + + Scenario: ORC to JSON + When the REPL is ran and the user types: + ``` + read("fixtures/userdata.orc") |> write("$TEMPDIR/userdata_orc.json") + ``` + Then the file "$TEMPDIR/userdata_orc.json" should exist + And that file should be valid JSON + + Scenario: ORC to YAML + When the REPL is ran and the user types: + ``` + read("fixtures/userdata.orc") |> write("$TEMPDIR/userdata_orc.yaml") + ``` + Then the file "$TEMPDIR/userdata_orc.yaml" should exist + And that file should be valid YAML + + Scenario: ORC to Parquet + When the REPL is ran and the user types: + ``` + read("fixtures/userdata.orc") |> write("$TEMPDIR/userdata_orc.parquet") + ``` + Then the file "$TEMPDIR/userdata_orc.parquet" should exist + + Scenario: ORC to Avro + When the REPL is ran and the user types: + ``` + read("fixtures/userdata.orc") |> write("$TEMPDIR/userdata_orc.avro") + ``` + Then the file "$TEMPDIR/userdata_orc.avro" should exist + + Scenario: ORC to XLSX + When the REPL is ran and the user types: + ``` + read("fixtures/userdata.orc") |> write("$TEMPDIR/userdata_orc.xlsx") + ``` + Then the file "$TEMPDIR/userdata_orc.xlsx" should exist + + Scenario: Parquet to CSV with select + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> select(:two, :four) |> write("$TEMPDIR/table_select.csv") + ``` + Then the file "$TEMPDIR/table_select.csv" should exist + And that file should be a CSV file + And the first line of that file should be: "two,four" + And that file should have 4 lines + + Scenario: Parquet to 
CSV with head + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> head(2) |> write("$TEMPDIR/table_head.csv") + ``` + Then the file "$TEMPDIR/table_head.csv" should exist + And that file should be a CSV file + And that file should have 3 lines + + Scenario: Avro to JSON with select and head + When the REPL is ran and the user types: + ``` + read("fixtures/userdata5.avro") |> select(:id, :first_name, :email) |> head(5) |> write("$TEMPDIR/userdata5_subset.json") + ``` + Then the file "$TEMPDIR/userdata5_subset.json" should exist + And that file should be valid JSON + And that file should contain "id" + And that file should contain "first_name" + And that file should contain "email" + + Scenario: Parquet to YAML with select + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> select(:two, :four) |> write("$TEMPDIR/table_select.yaml") + ``` + Then the file "$TEMPDIR/table_select.yaml" should exist + And that file should be valid YAML + And that file should contain "two:" + And that file should contain "four:" + + Scenario: Avro to CSV with head + When the REPL is ran and the user types: + ``` + read("fixtures/userdata5.avro") |> head(10) |> write("$TEMPDIR/userdata5_head.csv") + ``` + Then the file "$TEMPDIR/userdata5_head.csv" should exist + And that file should be a CSV file + And the first line of that file should contain "id" + And that file should have 11 lines diff --git a/tests/repl.rs b/tests/repl.rs index 4d0eddc..2ba6807 100644 --- a/tests/repl.rs +++ b/tests/repl.rs @@ -144,6 +144,76 @@ fn first_line_of_file_should_be(world: &mut ReplWorld, expected: String) { ); } +#[then(regex = r#"^the first line of that file should contain "(.+)"$"#)] +fn first_line_of_file_should_contain(world: &mut ReplWorld, expected: String) { + let path = world + .last_file + .as_ref() + .expect("No file has been set; use 'the file \"...\" should exist' first"); + let file = std::fs::File::open(path).expect("Failed to 
open file"); + let first_line = std::io::BufReader::new(file) + .lines() + .next() + .expect("File is empty") + .expect("Failed to read line"); + assert!( + first_line.contains(&expected), + "Expected first line to contain '{expected}', but got: {first_line}" + ); +} + +#[then(regex = r#"^that file should be valid JSON$"#)] +fn that_file_should_be_valid_json(world: &mut ReplWorld) { + let path = world + .last_file + .as_ref() + .expect("No file has been set; use 'the file \"...\" should exist' first"); + let content = std::fs::read_to_string(path).expect("Failed to read file"); + serde_json::from_str::(content.trim()) + .expect("Expected file to contain valid JSON, but parsing failed"); +} + +#[then(regex = r#"^that file should be valid YAML$"#)] +fn that_file_should_be_valid_yaml(world: &mut ReplWorld) { + let path = world + .last_file + .as_ref() + .expect("No file has been set; use 'the file \"...\" should exist' first"); + let content = std::fs::read_to_string(path).expect("Failed to read file"); + serde_yaml::from_str::(content.trim()) + .expect("Expected file to contain valid YAML, but parsing failed"); +} + +#[then(regex = r#"^that file should contain "(.+)"$"#)] +fn that_file_should_contain(world: &mut ReplWorld, expected: String) { + let path = world + .last_file + .as_ref() + .expect("No file has been set; use 'the file \"...\" should exist' first"); + let content = std::fs::read_to_string(path).expect("Failed to read file"); + assert!( + content.contains(&expected), + "Expected file {path} to contain '{expected}', but it did not" + ); +} + +#[then(regex = r#"^that file should have (\d+) lines$"#)] +fn that_file_should_have_n_lines(world: &mut ReplWorld, n: usize) { + let path = world + .last_file + .as_ref() + .expect("No file has been set; use 'the file \"...\" should exist' first"); + let file = std::fs::File::open(path).expect("Failed to open file"); + let line_count = std::io::BufReader::new(file) + .lines() + .filter(|r| r.as_ref().is_ok_and(|s| 
!s.trim().is_empty())) + .count(); + assert!( + line_count == n, + "Expected file {path} to have {n} lines, but got {line_count}" + ); +} + fn main() { futures::executor::block_on(ReplWorld::run("features/repl")); } From 4eb1d1b41ce7d31a8526c3b45524f2023428f982 Mon Sep 17 00:00:00 2001 From: Alistair Israel Date: Sun, 1 Mar 2026 21:40:58 -0500 Subject: [PATCH 08/11] Add validation steps for output files in REPL conversion scenarios - Enhanced the conversion feature tests by adding validation steps to ensure that output files generated from various conversions (Avro, Parquet, ORC, XLSX) are not only created but also valid according to their respective formats. - Implemented new test functions to check the validity of Avro, Parquet, ORC, and XLSX files, improving the robustness of the REPL test suite and ensuring accurate data handling across formats. --- features/repl/conversion.feature | 8 ++++ tests/repl.rs | 64 ++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) diff --git a/features/repl/conversion.feature b/features/repl/conversion.feature index 62dc1fb..fd5406c 100644 --- a/features/repl/conversion.feature +++ b/features/repl/conversion.feature @@ -35,6 +35,7 @@ Feature: Conversion read("fixtures/table.parquet") |> write("$TEMPDIR/table.avro") ``` Then the file "$TEMPDIR/table.avro" should exist + And that file should be valid Avro Scenario: Parquet to XLSX When the REPL is ran and the user types: @@ -42,6 +43,7 @@ Feature: Conversion read("fixtures/table.parquet") |> write("$TEMPDIR/table.xlsx") ``` Then the file "$TEMPDIR/table.xlsx" should exist + And that file should be valid XLSX Scenario: Avro to CSV When the REPL is ran and the user types: @@ -77,6 +79,7 @@ Feature: Conversion read("fixtures/userdata5.avro") |> write("$TEMPDIR/userdata5.parquet") ``` Then the file "$TEMPDIR/userdata5.parquet" should exist + And that file should be valid Parquet Scenario: Avro to ORC When the REPL is ran and the user types: @@ -84,6 +87,7 @@ Feature: 
Conversion read("fixtures/userdata5.avro") |> write("$TEMPDIR/userdata5.orc") ``` Then the file "$TEMPDIR/userdata5.orc" should exist + And that file should be valid ORC Scenario: Avro to XLSX When the REPL is ran and the user types: @@ -91,6 +95,7 @@ Feature: Conversion read("fixtures/userdata5.avro") |> write("$TEMPDIR/userdata5.xlsx") ``` Then the file "$TEMPDIR/userdata5.xlsx" should exist + And that file should be valid XLSX Scenario: ORC to CSV When the REPL is ran and the user types: @@ -122,6 +127,7 @@ Feature: Conversion read("fixtures/userdata.orc") |> write("$TEMPDIR/userdata_orc.parquet") ``` Then the file "$TEMPDIR/userdata_orc.parquet" should exist + And that file should be valid Parquet Scenario: ORC to Avro When the REPL is ran and the user types: @@ -129,6 +135,7 @@ Feature: Conversion read("fixtures/userdata.orc") |> write("$TEMPDIR/userdata_orc.avro") ``` Then the file "$TEMPDIR/userdata_orc.avro" should exist + And that file should be valid Avro Scenario: ORC to XLSX When the REPL is ran and the user types: @@ -136,6 +143,7 @@ Feature: Conversion read("fixtures/userdata.orc") |> write("$TEMPDIR/userdata_orc.xlsx") ``` Then the file "$TEMPDIR/userdata_orc.xlsx" should exist + And that file should be valid XLSX Scenario: Parquet to CSV with select When the REPL is ran and the user types: diff --git a/tests/repl.rs b/tests/repl.rs index 2ba6807..e91170f 100644 --- a/tests/repl.rs +++ b/tests/repl.rs @@ -1,4 +1,5 @@ use std::io::BufRead; +use std::io::BufReader; use std::io::Write; use std::path::Path; use std::process::Stdio; @@ -10,6 +11,7 @@ use cucumber::when; use expectrl::Expect; use expectrl::session::OsSession; use gherkin::Step; +use parquet::file::reader::FileReader; const TEMPDIR_PLACEHOLDER: &str = "$TEMPDIR"; @@ -184,6 +186,68 @@ fn that_file_should_be_valid_yaml(world: &mut ReplWorld) { .expect("Expected file to contain valid YAML, but parsing failed"); } +#[then(regex = r#"^that file should be valid Avro$"#)] +fn 
that_file_should_be_valid_avro(world: &mut ReplWorld) { + let path = world + .last_file + .as_ref() + .expect("No file has been set; use 'the file \"...\" should exist' first"); + let file = std::fs::File::open(path).expect("Failed to open file"); + let reader = arrow_avro::reader::ReaderBuilder::new() + .build(BufReader::new(file)) + .expect("Expected file to be valid Avro, but reading failed"); + let schema = reader.schema(); + assert!( + !schema.fields().is_empty(), + "Expected Avro file to have at least one field" + ); +} + +#[then(regex = r#"^that file should be valid Parquet$"#)] +fn that_file_should_be_valid_parquet(world: &mut ReplWorld) { + let path = world + .last_file + .as_ref() + .expect("No file has been set; use 'the file \"...\" should exist' first"); + let file = std::fs::File::open(path).expect("Failed to open file"); + let reader = parquet::file::reader::SerializedFileReader::new(file) + .expect("Expected file to be valid Parquet, but reading failed"); + let metadata = reader.metadata(); + assert!( + !metadata.file_metadata().schema().get_fields().is_empty(), + "Expected Parquet file to have at least one column" + ); +} + +#[then(regex = r#"^that file should be valid ORC$"#)] +fn that_file_should_be_valid_orc(world: &mut ReplWorld) { + let path = world + .last_file + .as_ref() + .expect("No file has been set; use 'the file \"...\" should exist' first"); + let file = std::fs::File::open(path).expect("Failed to open file"); + let builder = orc_rust::arrow_reader::ArrowReaderBuilder::try_new(file) + .expect("Expected file to be valid ORC, but reading failed"); + let schema = builder.schema(); + assert!( + !schema.fields().is_empty(), + "Expected ORC file to have at least one column" + ); +} + +#[then(regex = r#"^that file should be valid XLSX$"#)] +fn that_file_should_be_valid_xlsx(world: &mut ReplWorld) { + let path = world + .last_file + .as_ref() + .expect("No file has been set; use 'the file \"...\" should exist' first"); + let bytes = 
std::fs::read(path).expect("Failed to read file"); + assert!( + bytes.len() >= 4 && bytes[..4] == [0x50, 0x4B, 0x03, 0x04], + "Expected file to be a valid XLSX (ZIP archive), but magic bytes did not match" + ); +} + #[then(regex = r#"^that file should contain "(.+)"$"#)] fn that_file_should_contain(world: &mut ReplWorld, expected: String) { let path = world From f2988e7e1d3ecb1b458fe1bee244996f28b0b099 Mon Sep 17 00:00:00 2001 From: Alistair Israel Date: Sun, 1 Mar 2026 21:50:13 -0500 Subject: [PATCH 09/11] Add REPL feature tests for head, select, and tail operations - Introduced new feature files for head, select, and tail operations in the REPL, covering various scenarios for reading data from Parquet, Avro, and ORC formats. - Each feature includes validation steps to ensure output files are created, exist in the correct format (CSV, JSON, YAML, Parquet, Avro), and contain expected content. - Enhanced test coverage for data manipulation operations, improving the robustness of the REPL test suite. 
--- features/repl/head.feature | 118 +++++++++++++++++++++++++++++++++ features/repl/select.feature | 124 +++++++++++++++++++++++++++++++++++ features/repl/tail.feature | 118 +++++++++++++++++++++++++++++++++ 3 files changed, 360 insertions(+) create mode 100644 features/repl/head.feature create mode 100644 features/repl/select.feature create mode 100644 features/repl/tail.feature diff --git a/features/repl/head.feature b/features/repl/head.feature new file mode 100644 index 0000000..cd13d5e --- /dev/null +++ b/features/repl/head.feature @@ -0,0 +1,118 @@ +Feature: Head + + Scenario: Head from Parquet + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> head(2) |> write("$TEMPDIR/head_parquet.csv") + ``` + Then the file "$TEMPDIR/head_parquet.csv" should exist + And that file should be a CSV file + And the first line of that file should be: "one,two,three,four,five,__index_level_0__" + And that file should have 3 lines + And that file should contain "foo" + And that file should contain "bar" + + Scenario: Head a single row + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> head(1) |> write("$TEMPDIR/head_one.csv") + ``` + Then the file "$TEMPDIR/head_one.csv" should exist + And that file should be a CSV file + And that file should have 2 lines + And that file should contain "foo" + + Scenario: Head more rows than available + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> head(100) |> write("$TEMPDIR/head_all.csv") + ``` + Then the file "$TEMPDIR/head_all.csv" should exist + And that file should be a CSV file + And that file should have 4 lines + + Scenario: Head from Avro + When the REPL is ran and the user types: + ``` + read("fixtures/userdata5.avro") |> head(5) |> write("$TEMPDIR/head_avro.csv") + ``` + Then the file "$TEMPDIR/head_avro.csv" should exist + And that file should be a CSV file + And the first line of that file should contain "id" + And the first 
line of that file should contain "first_name" + And that file should have 6 lines + + Scenario: Head from ORC + When the REPL is ran and the user types: + ``` + read("fixtures/userdata.orc") |> head(3) |> write("$TEMPDIR/head_orc.csv") + ``` + Then the file "$TEMPDIR/head_orc.csv" should exist + And that file should be a CSV file + And that file should have 4 lines + + Scenario: Head with select + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> select(:two, :three) |> head(2) |> write("$TEMPDIR/head_select.csv") + ``` + Then the file "$TEMPDIR/head_select.csv" should exist + And that file should be a CSV file + And the first line of that file should be: "two,three" + And that file should have 3 lines + + Scenario: Head to JSON + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> head(2) |> write("$TEMPDIR/head.json") + ``` + Then the file "$TEMPDIR/head.json" should exist + And that file should be valid JSON + And that file should contain "foo" + And that file should contain "bar" + + Scenario: Head to YAML + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> head(2) |> write("$TEMPDIR/head.yaml") + ``` + Then the file "$TEMPDIR/head.yaml" should exist + And that file should be valid YAML + And that file should contain "two:" + And that file should contain "foo" + + Scenario: Head to Parquet + When the REPL is ran and the user types: + ``` + read("fixtures/userdata5.avro") |> head(10) |> write("$TEMPDIR/head.parquet") + ``` + Then the file "$TEMPDIR/head.parquet" should exist + And that file should be valid Parquet + + Scenario: Head to Avro + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> head(2) |> write("$TEMPDIR/head.avro") + ``` + Then the file "$TEMPDIR/head.avro" should exist + And that file should be valid Avro + + Scenario: Head with select from Avro + When the REPL is ran and the user types: + ``` + 
read("fixtures/userdata5.avro") |> select(:id, :email) |> head(3) |> write("$TEMPDIR/head_select_avro.csv") + ``` + Then the file "$TEMPDIR/head_select_avro.csv" should exist + And that file should be a CSV file + And the first line of that file should be: "id,email" + And that file should have 4 lines + + Scenario: Head with select to JSON + When the REPL is ran and the user types: + ``` + read("fixtures/userdata5.avro") |> select(:id, :first_name) |> head(2) |> write("$TEMPDIR/head_select.json") + ``` + Then the file "$TEMPDIR/head_select.json" should exist + And that file should be valid JSON + And that file should contain "id" + And that file should contain "first_name" diff --git a/features/repl/select.feature b/features/repl/select.feature new file mode 100644 index 0000000..f7657ab --- /dev/null +++ b/features/repl/select.feature @@ -0,0 +1,124 @@ +Feature: Select + + Scenario: Select columns using symbol syntax + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> select(:two, :three) |> write("$TEMPDIR/select_symbols.csv") + ``` + Then the file "$TEMPDIR/select_symbols.csv" should exist + And that file should be a CSV file + And the first line of that file should be: "two,three" + And that file should have 4 lines + + Scenario: Select columns using string syntax + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> select("two", "three") |> write("$TEMPDIR/select_strings.csv") + ``` + Then the file "$TEMPDIR/select_strings.csv" should exist + And that file should be a CSV file + And the first line of that file should be: "two,three" + And that file should have 4 lines + + Scenario: Select a single column + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> select(:two) |> write("$TEMPDIR/select_single.csv") + ``` + Then the file "$TEMPDIR/select_single.csv" should exist + And that file should be a CSV file + And the first line of that file should be: "two" + And 
that file should have 4 lines + + Scenario: Select reorders columns + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> select(:three, :one) |> write("$TEMPDIR/select_reorder.csv") + ``` + Then the file "$TEMPDIR/select_reorder.csv" should exist + And that file should be a CSV file + And the first line of that file should be: "three,one" + + Scenario: Select many columns + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> select(:five, :four, :three, :two, :one) |> write("$TEMPDIR/select_many.csv") + ``` + Then the file "$TEMPDIR/select_many.csv" should exist + And that file should be a CSV file + And the first line of that file should be: "five,four,three,two,one" + And that file should have 4 lines + + Scenario: Select from Avro + When the REPL is ran and the user types: + ``` + read("fixtures/userdata5.avro") |> select(:id, :email) |> write("$TEMPDIR/select_avro.csv") + ``` + Then the file "$TEMPDIR/select_avro.csv" should exist + And that file should be a CSV file + And the first line of that file should be: "id,email" + + Scenario: Select from ORC + When the REPL is ran and the user types: + ``` + read("fixtures/userdata.orc") |> select(:_col1, :_col2) |> write("$TEMPDIR/select_orc.csv") + ``` + Then the file "$TEMPDIR/select_orc.csv" should exist + And that file should be a CSV file + And the first line of that file should be: "_col1,_col2" + + Scenario: Select with head + When the REPL is ran and the user types: + ``` + read("fixtures/userdata5.avro") |> select(:id, :first_name) |> head(5) |> write("$TEMPDIR/select_head.csv") + ``` + Then the file "$TEMPDIR/select_head.csv" should exist + And that file should be a CSV file + And the first line of that file should be: "id,first_name" + And that file should have 6 lines + + Scenario: Select with tail + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> select(:two, :one) |> tail(1) |> 
write("$TEMPDIR/select_tail.csv") + ``` + Then the file "$TEMPDIR/select_tail.csv" should exist + And that file should be a CSV file + And the first line of that file should be: "two,one" + And that file should have 2 lines + + Scenario: Select to JSON + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> select(:two, :three) |> write("$TEMPDIR/select.json") + ``` + Then the file "$TEMPDIR/select.json" should exist + And that file should be valid JSON + And that file should contain "two" + And that file should contain "three" + + Scenario: Select to YAML + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> select(:two, :three) |> write("$TEMPDIR/select.yaml") + ``` + Then the file "$TEMPDIR/select.yaml" should exist + And that file should be valid YAML + And that file should contain "two:" + And that file should contain "three:" + + Scenario: Select to Avro + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> select(:one, :two) |> write("$TEMPDIR/select.avro") + ``` + Then the file "$TEMPDIR/select.avro" should exist + And that file should be valid Avro + + Scenario: Select to Parquet + When the REPL is ran and the user types: + ``` + read("fixtures/userdata5.avro") |> select(:id, :first_name, :email) |> write("$TEMPDIR/select.parquet") + ``` + Then the file "$TEMPDIR/select.parquet" should exist + And that file should be valid Parquet diff --git a/features/repl/tail.feature b/features/repl/tail.feature new file mode 100644 index 0000000..6eff6ad --- /dev/null +++ b/features/repl/tail.feature @@ -0,0 +1,118 @@ +Feature: Tail + + Scenario: Tail from Parquet + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> tail(2) |> write("$TEMPDIR/tail_parquet.csv") + ``` + Then the file "$TEMPDIR/tail_parquet.csv" should exist + And that file should be a CSV file + And the first line of that file should be: "one,two,three,four,five,__index_level_0__" + 
And that file should have 3 lines + And that file should contain "bar" + And that file should contain "baz" + + Scenario: Tail a single row + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> tail(1) |> write("$TEMPDIR/tail_one.csv") + ``` + Then the file "$TEMPDIR/tail_one.csv" should exist + And that file should be a CSV file + And that file should have 2 lines + And that file should contain "baz" + + Scenario: Tail more rows than available + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> tail(100) |> write("$TEMPDIR/tail_all.csv") + ``` + Then the file "$TEMPDIR/tail_all.csv" should exist + And that file should be a CSV file + And that file should have 4 lines + + Scenario: Tail from Avro + When the REPL is ran and the user types: + ``` + read("fixtures/userdata5.avro") |> tail(5) |> write("$TEMPDIR/tail_avro.csv") + ``` + Then the file "$TEMPDIR/tail_avro.csv" should exist + And that file should be a CSV file + And the first line of that file should contain "id" + And the first line of that file should contain "first_name" + And that file should have 6 lines + + Scenario: Tail from ORC + When the REPL is ran and the user types: + ``` + read("fixtures/userdata.orc") |> tail(3) |> write("$TEMPDIR/tail_orc.csv") + ``` + Then the file "$TEMPDIR/tail_orc.csv" should exist + And that file should be a CSV file + And that file should have 4 lines + + Scenario: Tail with select + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> select(:two, :three) |> tail(2) |> write("$TEMPDIR/tail_select.csv") + ``` + Then the file "$TEMPDIR/tail_select.csv" should exist + And that file should be a CSV file + And the first line of that file should be: "two,three" + And that file should have 3 lines + + Scenario: Tail to JSON + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> tail(2) |> write("$TEMPDIR/tail.json") + ``` + Then the file 
"$TEMPDIR/tail.json" should exist + And that file should be valid JSON + And that file should contain "bar" + And that file should contain "baz" + + Scenario: Tail to YAML + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> tail(2) |> write("$TEMPDIR/tail.yaml") + ``` + Then the file "$TEMPDIR/tail.yaml" should exist + And that file should be valid YAML + And that file should contain "two:" + And that file should contain "baz" + + Scenario: Tail to Parquet + When the REPL is ran and the user types: + ``` + read("fixtures/userdata5.avro") |> tail(10) |> write("$TEMPDIR/tail.parquet") + ``` + Then the file "$TEMPDIR/tail.parquet" should exist + And that file should be valid Parquet + + Scenario: Tail to Avro + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> tail(2) |> write("$TEMPDIR/tail.avro") + ``` + Then the file "$TEMPDIR/tail.avro" should exist + And that file should be valid Avro + + Scenario: Tail with select from Avro + When the REPL is ran and the user types: + ``` + read("fixtures/userdata5.avro") |> select(:id, :email) |> tail(3) |> write("$TEMPDIR/tail_select_avro.csv") + ``` + Then the file "$TEMPDIR/tail_select_avro.csv" should exist + And that file should be a CSV file + And the first line of that file should be: "id,email" + And that file should have 4 lines + + Scenario: Tail with select to JSON + When the REPL is ran and the user types: + ``` + read("fixtures/userdata5.avro") |> select(:id, :first_name) |> tail(2) |> write("$TEMPDIR/tail_select.json") + ``` + Then the file "$TEMPDIR/tail_select.json" should exist + And that file should be valid JSON + And that file should contain "id" + And that file should contain "first_name" From 4ffc2bc1df44d33b865d76cc8d29c1a15f83d340 Mon Sep 17 00:00:00 2001 From: Alistair Israel Date: Sun, 1 Mar 2026 22:06:52 -0500 Subject: [PATCH 10/11] Add interactive REPL section to README - Introduced a new section in the README detailing the interactive 
REPL (Read-Eval-Print Loop) feature of `datu`. - Documented the usage of key functions such as `read`, `write`, `select`, `head`, and `tail`, along with examples for composing data pipelines. - Enhanced user guidance on chaining functions to build complex data processing workflows. --- README.md | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/README.md b/README.md index 54138b1..3a5bf94 100644 --- a/README.md +++ b/README.md @@ -266,6 +266,81 @@ Print the installed `datu` version: datu version ``` +## Interactive Mode (REPL) + +Running `datu` without any command starts an interactive REPL (Read-Eval-Print Loop): + +```sh +datu +> +``` + +In the REPL, you compose data pipelines using the `|>` (pipe) operator to chain functions together. The general pattern is: + +``` +read("input") |> ... |> write("output") +``` + +### Functions + +#### `read(path)` + +Read a data file. Supported formats: Parquet (`.parquet`, `.parq`), Avro (`.avro`), ORC (`.orc`). + +``` +> read("data.parquet") |> write("data.csv") +``` + +#### `write(path)` + +Write data to a file. The output format is inferred from the file extension. Supported formats: CSV (`.csv`), JSON (`.json`), YAML (`.yaml`), Parquet (`.parquet`, `.parq`), Avro (`.avro`), ORC (`.orc`), XLSX (`.xlsx`). + +``` +> read("data.parquet") |> write("output.json") +``` + +#### `select(columns...)` + +Select and reorder columns. Columns can be specified using symbol syntax (`:name`) or string syntax (`"name"`). + +``` +> read("data.parquet") |> select(:id, :email) |> write("subset.csv") +> read("data.parquet") |> select("id", "email") |> write("subset.csv") +``` + +Columns appear in the output in the order they are listed, so `select` can also be used to reorder columns: + +``` +> read("data.parquet") |> select(:email, :id) |> write("reordered.csv") +``` + +#### `head(n)` + +Take the first _n_ rows. 
+ +``` +> read("data.parquet") |> head(10) |> write("first10.csv") +``` + +#### `tail(n)` + +Take the last _n_ rows. + +``` +> read("data.parquet") |> tail(10) |> write("last10.csv") +``` + +### Composing Pipelines + +Functions can be chained in any order to build more complex pipelines: + +``` +> read("users.avro") |> select(:id, :first_name, :email) |> head(5) |> write("top5.json") +> read("data.parquet") |> select(:two, :one) |> tail(1) |> write("last_row.csv") +``` + +--- + ## How it Works Internally Internally, `datu` constructs a pipeline based on the command and arguments. From 8b383d5773dd439c5c56f5e6d5c873852839f78d Mon Sep 17 00:00:00 2001 From: Alistair Israel Date: Mon, 2 Mar 2026 17:42:01 -0500 Subject: [PATCH 11/11] Update README and feature tests for output clarity and record validation - Changed code block formatting in the README from plain to text for better readability. - Enhanced feature tests for the `head` command in CLI and REPL by specifying that output files should have a defined number of records. - Added a new step in the REPL tests to validate the number of records in output files, improving test coverage and ensuring accurate data handling. --- README.md | 16 +++++------ features/cli/head.feature | 16 +++++------ features/repl/head.feature | 4 +++ tests/repl.rs | 57 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 77 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 3a5bf94..3206c07 100644 --- a/README.md +++ b/README.md @@ -277,7 +277,7 @@ datu In the REPL, you compose data pipelines using the `|>` (pipe) operator to chain functions together. The general pattern is: -``` +```text read("input") |> ... |> write("output") ``` @@ -287,7 +287,7 @@ read("input") |> ... |> write("output") Read a data file. Supported formats: Parquet (`.parquet`, `.parq`), Avro (`.avro`), ORC (`.orc`). -``` +```text > read("data.parquet") |> write("data.csv") ``` @@ -295,7 +295,7 @@ Read a data file. 
Supported formats: Parquet (`.parquet`, `.parq`), Avro (`.avro Write data to a file. The output format is inferred from the file extension. Supported formats: CSV (`.csv`), JSON (`.json`), YAML (`.yaml`), Parquet (`.parquet`, `.parq`), Avro (`.avro`), ORC (`.orc`), XLSX (`.xlsx`). -``` +```text > read("data.parquet") |> write("output.json") ``` @@ -303,14 +303,14 @@ Write data to a file. The output format is inferred from the file extension. Sup Select and reorder columns. Columns can be specified using symbol syntax (`:name`) or string syntax (`"name"`). -``` +```text > read("data.parquet") |> select(:id, :email) |> write("subset.csv") > read("data.parquet") |> select("id", "email") |> write("subset.csv") ``` Columns appear in the output in the order they are listed, so `select` can also be used to reorder columns: -``` +```text > read("data.parquet") |> select(:email, :id) |> write("reordered.csv") ``` @@ -318,7 +318,7 @@ Columns appear in the output in the order they are listed, so `select` can also Take the first _n_ rows. -``` +```text > read("data.parquet") |> head(10) |> write("first10.csv") ``` @@ -326,7 +326,7 @@ Take the first _n_ rows. Take the last _n_ rows. -``` +```text > read("data.parquet") |> tail(10) |> write("last10.csv") ``` @@ -334,7 +334,7 @@ Take the last _n_ rows. 
Functions can be chained in any order to build more complex pipelines: -``` +```text > read("users.avro") |> select(:id, :first_name, :email) |> head(5) |> write("top5.json") > read("data.parquet") |> select(:two, :one) |> tail(1) |> write("last_row.csv") ``` diff --git a/features/cli/head.feature b/features/cli/head.feature index 17d65f4..5d62e09 100644 --- a/features/cli/head.feature +++ b/features/cli/head.feature @@ -15,25 +15,25 @@ Feature: Head When I run `datu head fixtures/userdata5.avro` Then the command should succeed And the output should have a header and 10 lines - And the first line should be: registration_dttm,id,first_name,last_name,email,gender,ip_address,cc,country,birthdate,salary,title,comments + And the first line of the output should be: registration_dttm,id,first_name,last_name,email,gender,ip_address,cc,country,birthdate,salary,title,comments Scenario: Head Avro with -n 3 When I run `datu head fixtures/userdata5.avro -n 3` Then the command should succeed And the output should have a header and 3 lines - And the first line should be: registration_dttm,id,first_name,last_name,email,gender,ip_address,cc,country,birthdate,salary,title,comments + And the first line of the output should be: registration_dttm,id,first_name,last_name,email,gender,ip_address,cc,country,birthdate,salary,title,comments Scenario: Head Parquet with --select When I run `datu head fixtures/userdata.parquet -n 2 --select id,last_name` Then the command should succeed And the output should have a header and 2 lines - And the first line should be: id,last_name + And the first line of the output should be: id,last_name Scenario: Head Avro with --select When I run `datu head fixtures/userdata5.avro -n 2 --select id,email` Then the command should succeed And the output should have a header and 2 lines - And the first line should be: id,email + And the first line of the output should be: id,email Scenario: Head ORC default (10 lines) When I run `datu convert 
fixtures/userdata5.avro $TEMPDIR/userdata5.orc --select id,first_name --limit 10` @@ -41,7 +41,7 @@ Feature: Head When I run `datu head $TEMPDIR/userdata5.orc` Then the command should succeed And the output should have a header and 10 lines - And the first line should be: id,first_name + And the first line of the output should be: id,first_name Scenario: Head ORC with -n 3 When I run `datu convert fixtures/userdata5.avro $TEMPDIR/userdata5.orc --select id,first_name --limit 10` @@ -49,7 +49,7 @@ Feature: Head When I run `datu head $TEMPDIR/userdata5.orc -n 3` Then the command should succeed And the output should have a header and 3 lines - And the first line should be: id,first_name + And the first line of the output should be: id,first_name Scenario: Head ORC with --select When I run `datu convert fixtures/userdata5.avro $TEMPDIR/userdata5.orc --select id,first_name --limit 10` @@ -57,7 +57,7 @@ Feature: Head When I run `datu head $TEMPDIR/userdata5.orc -n 2 --select id,first_name` Then the command should succeed And the output should have a header and 2 lines - And the first line should be: id,first_name + And the first line of the output should be: id,first_name Scenario: Head ORC with --output csv When I run `datu convert fixtures/userdata5.avro $TEMPDIR/userdata5.orc --select id,first_name --limit 10` @@ -65,7 +65,7 @@ Feature: Head When I run `datu head $TEMPDIR/userdata5.orc -n 2 --output csv` Then the command should succeed And the output should have a header and 2 lines - And the first line should be: id,first_name + And the first line of the output should be: id,first_name Scenario: Head ORC with --output json When I run `datu convert fixtures/userdata5.avro $TEMPDIR/userdata5.orc --select id,first_name --limit 10` diff --git a/features/repl/head.feature b/features/repl/head.feature index cd13d5e..7a5b33c 100644 --- a/features/repl/head.feature +++ b/features/repl/head.feature @@ -70,6 +70,7 @@ Feature: Head And that file should be valid JSON And that 
file should contain "foo" And that file should contain "bar" + And that file should have 2 records Scenario: Head to YAML When the REPL is ran and the user types: @@ -88,6 +89,7 @@ Feature: Head ``` Then the file "$TEMPDIR/head.parquet" should exist And that file should be valid Parquet + And that file should have 10 records Scenario: Head to Avro When the REPL is ran and the user types: @@ -96,6 +98,7 @@ Feature: Head ``` Then the file "$TEMPDIR/head.avro" should exist And that file should be valid Avro + And that file should have 2 records Scenario: Head with select from Avro When the REPL is ran and the user types: @@ -116,3 +119,4 @@ Feature: Head And that file should be valid JSON And that file should contain "id" And that file should contain "first_name" + And that file should have 2 records diff --git a/tests/repl.rs b/tests/repl.rs index e91170f..efb4127 100644 --- a/tests/repl.rs +++ b/tests/repl.rs @@ -278,6 +278,63 @@ fn that_file_should_have_n_lines(world: &mut ReplWorld, n: usize) { ); } +#[then(regex = r#"^that file should have (\d+) records$"#)] +fn that_file_should_have_n_records(world: &mut ReplWorld, n: usize) { + use datu::utils::FileType; + + let path = world + .last_file + .as_ref() + .expect("No file has been set; use 'the file \"...\" should exist' first"); + let file_type = FileType::try_from(path.as_str()) + .unwrap_or_else(|e| panic!("Cannot determine file type for {path}: {e}")); + let row_count = match file_type { + FileType::Json => { + let content = std::fs::read_to_string(path).expect("Failed to read file"); + let value: serde_json::Value = + serde_json::from_str(content.trim()).expect("Failed to parse JSON"); + value + .as_array() + .expect("Expected JSON to be an array") + .len() + } + FileType::Parquet => { + let file = std::fs::File::open(path).expect("Failed to open file"); + let reader = parquet::file::reader::SerializedFileReader::new(file) + .expect("Failed to read Parquet file"); + reader + .metadata() + .row_groups() + 
.iter() + .map(|rg| rg.num_rows()) + .sum::<i64>() as usize + } + FileType::Avro => { + let file = std::fs::File::open(path).expect("Failed to open file"); + let reader = arrow_avro::reader::ReaderBuilder::new() + .build(BufReader::new(file)) + .expect("Failed to read Avro file"); + reader + .map(|batch| batch.expect("Failed to read Avro batch").num_rows()) + .sum() + } + FileType::Orc => { + let file = std::fs::File::open(path).expect("Failed to open file"); + let builder = orc_rust::arrow_reader::ArrowReaderBuilder::try_new(file) + .expect("Failed to read ORC file"); + let reader = builder.build(); + reader + .map(|batch| batch.expect("Failed to read ORC batch").num_rows()) + .sum() + } + other => panic!("Unsupported format for record counting: {other:?}"), + }; + assert!( + row_count == n, + "Expected file {path} to have {n} records, but got {row_count}" + ); +} + fn main() { futures::executor::block_on(ReplWorld::run("features/repl")); }