diff --git a/Cargo.lock b/Cargo.lock index c85a093..c89d2ce 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -483,6 +483,12 @@ dependencies = [ "serde", ] +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + [[package]] name = "bitflags" version = "2.11.0" @@ -784,6 +790,15 @@ version = "0.4.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75984efb6ed102a0d42db99afb6c1948f0380d1d91808d5529916e6c08b49d8d" +[[package]] +name = "conpty" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b72b06487a0d4683349ad74d62e87ad639b09667082b3c495c5b6bab7d84b3da" +dependencies = [ + "windows", +] + [[package]] name = "console" version = "0.16.2" @@ -1752,7 +1767,7 @@ dependencies = [ [[package]] name = "datu" -version = "0.2.4" +version = "0.3.0-alpha" dependencies = [ "anyhow", "arrow", @@ -1764,6 +1779,7 @@ dependencies = [ "csv", "cucumber", "datafusion", + "expectrl", "flt", "futures", "gherkin", @@ -1900,6 +1916,18 @@ version = "3.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dea2df4cf52843e0452895c455a1a2cfbb842a1e7329671acf418fdc53ed4c59" +[[package]] +name = "expectrl" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0e0706d01b4f43adaf7e0fb460e07477c36b74ae60fdeb1d045001bd77b4bd1" +dependencies = [ + "conpty", + "nix 0.26.4", + "ptyprocess", + "regex", +] + [[package]] name = "fallible-streaming-iterator" version = "0.1.9" @@ -1941,7 +1969,7 @@ version = "25.12.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "35f6839d7b3b98adde531effaf34f0c2badc6f4735d26fe74709d8e513a96ef3" dependencies = [ - "bitflags", + "bitflags 2.11.0", "rustc_version", ] @@ -2173,7 +2201,7 @@ version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0bf760ebf69878d9fd8f110c89703d90ce35095324d1f1edcb595c63945ee757" dependencies = [ - "bitflags", + "bitflags 2.11.0", "ignore", "walkdir", ] @@ -2718,6 +2746,15 @@ version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" +[[package]] +name = "memoffset" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4" +dependencies = [ + "autocfg", +] + [[package]] name = "minimal-lexical" version = "0.2.1" @@ -2743,13 +2780,26 @@ dependencies = [ "smallvec", ] +[[package]] +name = "nix" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "598beaf3cc6fdd9a5dfb1630c2800c7acd31df7aaf0f565796fba2b53ca1af1b" +dependencies = [ + "bitflags 1.3.2", + "cfg-if", + "libc", + "memoffset", + "pin-utils", +] + [[package]] name = "nix" version = "0.30.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "74523f3a35e05aba87a1d978330aef40f67b0304ac79c1c00b294c9830543db6" dependencies = [ - "bitflags", + "bitflags 2.11.0", "cfg-if", "cfg_aliases", "libc", @@ -3110,6 +3160,12 @@ version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + [[package]] name = "pkg-config" version = "0.3.32" @@ -3229,6 +3285,15 @@ dependencies = [ "cc", ] +[[package]] +name = "ptyprocess" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "101be273c0b1680d7056afddbaa88f02b6e9f2dc161165c30bee9914b6025a79" +dependencies = [ + "nix 0.26.4", +] + [[package]] name = "quad-rand" version = "0.2.3" @@ -3335,7 +3400,7 @@ version = "0.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" dependencies = [ - "bitflags", + "bitflags 2.11.0", ] [[package]] @@ -3424,7 +3489,7 @@ version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" dependencies = [ - "bitflags", + "bitflags 2.11.0", "errno", "libc", "linux-raw-sys", @@ -3443,7 +3508,7 @@ version = "17.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e902948a25149d50edc1a8e0141aad50f54e22ba83ff988cf8f7c9ef07f50564" dependencies = [ - "bitflags", + "bitflags 2.11.0", "cfg-if", "clipboard-win", "fd-lock", @@ -3451,7 +3516,7 @@ dependencies = [ "libc", "log", "memchr", - "nix", + "nix 0.30.1", "radix_trie", "unicode-segmentation", "unicode-width", @@ -4226,7 +4291,7 @@ version = "0.244.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" dependencies = [ - "bitflags", + "bitflags 2.11.0", "hashbrown 0.15.5", "indexmap", "semver", @@ -4261,6 +4326,15 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "windows" +version = "0.44.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e745dab35a0c4c77aa3ce42d595e13d2003d6902d6b08c9ef5fc326d08da12b" +dependencies = [ + "windows-targets 0.42.2", +] + [[package]] name = "windows-core" version = "0.62.2" @@ -4347,6 +4421,21 @@ dependencies = [ "windows-link", ] +[[package]] +name = "windows-targets" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" +dependencies = [ + "windows_aarch64_gnullvm 0.42.2", + "windows_aarch64_msvc 0.42.2", + "windows_i686_gnu 0.42.2", + "windows_i686_msvc 0.42.2", + "windows_x86_64_gnu 0.42.2", + "windows_x86_64_gnullvm 0.42.2", + "windows_x86_64_msvc 0.42.2", +] + [[package]] name = "windows-targets" version = "0.52.6" @@ -4380,6 +4469,12 @@ dependencies = [ "windows_x86_64_msvc 0.53.1", ] +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" + [[package]] name = "windows_aarch64_gnullvm" version = "0.52.6" @@ -4392,6 +4487,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" +[[package]] +name = "windows_aarch64_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" + [[package]] name = "windows_aarch64_msvc" version = "0.52.6" @@ -4404,6 +4505,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" +[[package]] +name = "windows_i686_gnu" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" + [[package]] name = "windows_i686_gnu" version = "0.52.6" @@ -4428,6 +4535,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" +[[package]] +name = "windows_i686_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" + [[package]] name = "windows_i686_msvc" version = "0.52.6" @@ -4440,6 +4553,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" +[[package]] +name = "windows_x86_64_gnu" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" + [[package]] name = "windows_x86_64_gnu" version = "0.52.6" @@ -4452,6 +4571,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" + [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" @@ -4464,6 +4589,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" +[[package]] +name = "windows_x86_64_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" + [[package]] name = "windows_x86_64_msvc" version = "0.52.6" @@ -4534,7 +4665,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" dependencies = [ "anyhow", - "bitflags", + "bitflags 2.11.0", "indexmap", "log", "serde", diff --git a/Cargo.toml b/Cargo.toml index 947ac46..cce938b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "datu" -version = "0.2.4" +version = "0.3.0-alpha" edition = "2024" description = "datu - a data file utility" license = "MIT" @@ -37,6 +37,7 @@ flt = { version = "0.0.2" } [dev-dependencies] criterion = "0.5" cucumber = "0.22.1" +expectrl = "0.8" serde_yaml = "0.9" gherkin = "0.15" tempfile = "3" @@ -48,3 +49,7 @@ harness = false [[test]] name = "features" harness = false + +[[test]] +name = "repl" +harness = false diff --git a/README.md b/README.md index 54138b1..3206c07 100644 --- a/README.md +++ b/README.md @@ -266,6 +266,81 @@ Print the installed `datu` version: datu version ``` +## Interactive Mode (REPL) + +Running `datu` without any command starts an interactive REPL (Read-Eval-Print Loop): + +```sh +datu +> +``` + +In the REPL, you compose data pipelines using the `|>` (pipe) operator to chain functions together. The general pattern is: + +```text +read("input") |> ... |> write("output") +``` + +### Functions + +#### `read(path)` + +Read a data file. Supported formats: Parquet (`.parquet`, `.parq`), Avro (`.avro`), ORC (`.orc`). + +```text +> read("data.parquet") |> write("data.csv") +``` + +#### `write(path)` + +Write data to a file. The output format is inferred from the file extension. Supported formats: CSV (`.csv`), JSON (`.json`), YAML (`.yaml`), Parquet (`.parquet`, `.parq`), Avro (`.avro`), ORC (`.orc`), XLSX (`.xlsx`). + +```text +> read("data.parquet") |> write("output.json") +``` + +#### `select(columns...)` + +Select and reorder columns. Columns can be specified using symbol syntax (`:name`) or string syntax (`"name"`). + +```text +> read("data.parquet") |> select(:id, :email) |> write("subset.csv") +> read("data.parquet") |> select("id", "email") |> write("subset.csv") +``` + +Columns appear in the output in the order they are listed, so `select` can also be used to reorder columns: + +```text +> read("data.parquet") |> select(:email, :id) |> write("reordered.csv") +``` + +#### `head(n)` + +Take the first _n_ rows. + +```text +> read("data.parquet") |> head(10) |> write("first10.csv") +``` + +#### `tail(n)` + +Take the last _n_ rows. + +```text +> read("data.parquet") |> tail(10) |> write("last10.csv") +``` + +### Composing Pipelines + +Functions can be chained in any order to build more complex pipelines: + +```text +> read("users.avro") |> select(:id, :first_name, :email) |> head(5) |> write("top5.json") +> read("data.parquet") |> select(:two, :one) |> tail(1) |> write("last_row.csv") +``` + +--- + ## How it Works Internally Internally, `datu` constructs a pipeline based on the command and arguments. diff --git a/features/cli.feature b/features/cli/cli.feature similarity index 95% rename from features/cli.feature rename to features/cli/cli.feature index 9ad91db..1c05dfe 100644 --- a/features/cli.feature +++ b/features/cli/cli.feature @@ -2,7 +2,7 @@ Feature: CLI Scenario: Print version When I run `datu --version` - Then the output should contain "datu 0.2.4" + Then the first line of the output should be: datu 0.3.0-alpha Scenario: Print help with help subcommand When I run `datu help` diff --git a/features/convert.feature b/features/cli/convert.feature similarity index 100% rename from features/convert.feature rename to features/cli/convert.feature diff --git a/features/count.feature b/features/cli/count.feature similarity index 100% rename from features/count.feature rename to features/cli/count.feature diff --git a/features/head.feature b/features/cli/head.feature similarity index 85% rename from features/head.feature rename to features/cli/head.feature index e21e873..5d62e09 100644 --- a/features/head.feature +++ b/features/cli/head.feature @@ -15,25 +15,25 @@ Feature: Head When I run `datu head fixtures/userdata5.avro` Then the command should succeed And the output should have a header and 10 lines - And the first line should be: registration_dttm,id,first_name,last_name,email,gender,ip_address,cc,country,birthdate,salary,title,comments + And the first line of the output should be: registration_dttm,id,first_name,last_name,email,gender,ip_address,cc,country,birthdate,salary,title,comments Scenario: Head Avro with -n 3 When I run `datu head fixtures/userdata5.avro -n 3` Then the command should succeed And the output should have a header and 3 lines - And the first line should be: registration_dttm,id,first_name,last_name,email,gender,ip_address,cc,country,birthdate,salary,title,comments + And the first line of the output should be: registration_dttm,id,first_name,last_name,email,gender,ip_address,cc,country,birthdate,salary,title,comments Scenario: Head Parquet with --select When I run `datu head fixtures/userdata.parquet -n 2 --select id,last_name` Then the command should succeed And the output should have a header and 2 lines - And the first line should be: id,last_name + And the first line of the output should be: id,last_name Scenario: Head Avro with --select When I run `datu head fixtures/userdata5.avro -n 2 --select id,email` Then the command should succeed And the output should have a header and 2 lines - And the first line should be: id,email + And the first line of the output should be: id,email Scenario: Head ORC default (10 lines) When I run `datu convert fixtures/userdata5.avro $TEMPDIR/userdata5.orc --select id,first_name --limit 10` @@ -41,7 +41,7 @@ Feature: Head When I run `datu head $TEMPDIR/userdata5.orc` Then the command should succeed And the output should have a header and 10 lines - And the first line should be: id,first_name + And the first line of the output should be: id,first_name Scenario: Head ORC with -n 3 When I run `datu convert fixtures/userdata5.avro $TEMPDIR/userdata5.orc --select id,first_name --limit 10` @@ -49,7 +49,7 @@ Feature: Head When I run `datu head $TEMPDIR/userdata5.orc -n 3` Then the command should succeed And the output should have a header and 3 lines - And the first line should be: id,first_name + And the first line of the output should be: id,first_name Scenario: Head ORC with --select When I run `datu convert fixtures/userdata5.avro $TEMPDIR/userdata5.orc --select id,first_name --limit 10` @@ -57,7 +57,7 @@ Feature: Head When I run `datu head $TEMPDIR/userdata5.orc -n 2 --select id,first_name` Then the command should succeed And the output should have a header and 2 lines - And the first line should be: id,first_name + And the first line of the output should be: id,first_name Scenario: Head ORC with --output csv When I run `datu convert fixtures/userdata5.avro $TEMPDIR/userdata5.orc --select id,first_name --limit 10` @@ -65,7 +65,7 @@ Feature: Head When I run `datu head $TEMPDIR/userdata5.orc -n 2 --output csv` Then the command should succeed And the output should have a header and 2 lines - And the first line should be: id,first_name + And the first line of the output should be: id,first_name Scenario: Head ORC with --output json When I run `datu convert fixtures/userdata5.avro $TEMPDIR/userdata5.orc --select id,first_name --limit 10` @@ -101,7 +101,7 @@ Feature: Head When I run `datu head fixtures/userdata5.avro -n 2 --output csv` Then the command should succeed And the output should have a header and 2 lines - And the first line should be: registration_dttm,id,first_name,last_name,email,gender,ip_address,cc,country,birthdate,salary,title,comments + And the first line of the output should be: registration_dttm,id,first_name,last_name,email,gender,ip_address,cc,country,birthdate,salary,title,comments Scenario: Head Avro with --output json When I run `datu head fixtures/userdata5.avro -n 2 --output json` diff --git a/features/schema.feature b/features/cli/schema.feature similarity index 100% rename from features/schema.feature rename to features/cli/schema.feature diff --git a/features/tail.feature b/features/cli/tail.feature similarity index 77% rename from features/tail.feature rename to features/cli/tail.feature index 7c579e8..bbb826a 100644 --- a/features/tail.feature +++ b/features/cli/tail.feature @@ -4,67 +4,67 @@ Feature: Tail Scenario: Tail Parquet default (10 lines) When I run `datu tail fixtures/table.parquet` Then the command should succeed - And the first line should contain "one" + And the first line of the output should contain "one" Scenario: Tail Parquet with -n 2 When I run `datu tail fixtures/table.parquet -n 2` Then the command should succeed - And the first line should contain "one" - And the first line should contain "two" + And the first line of the output should contain "one" + And the first line of the output should contain "two" Scenario: Tail Avro default (10 lines) When I run `datu tail fixtures/userdata5.avro` Then the command should succeed - And the first line should contain "id" - And the first line should contain "first_name" + And the first line of the output should contain "id" + And the first line of the output should contain "first_name" Scenario: Tail Avro with -n 3 When I run `datu tail fixtures/userdata5.avro -n 3` Then the command should succeed - And the first line should contain "email" + And the first line of the output should contain "email" Scenario: Tail Parquet with --select When I run `datu tail fixtures/table.parquet -n 2 --select two,four` Then the command should succeed - And the first line should contain "two" - And the first line should contain "four" + And the first line of the output should contain "two" + And the first line of the output should contain "four" Scenario: Tail Avro with --select When I run `datu tail fixtures/userdata5.avro -n 2 --select id,email` Then the command should succeed - And the first line should contain "id" - And the first line should contain "email" + And the first line of the output should contain "id" + And the first line of the output should contain "email" Scenario: Tail ORC default (10 lines) When I run `datu convert fixtures/userdata5.avro $TEMPDIR/userdata5.orc --select id,first_name --limit 10` Then the command should succeed When I run `datu tail $TEMPDIR/userdata5.orc` Then the command should succeed - And the first line should contain "id" - And the first line should contain "first_name" + And the first line of the output should contain "id" + And the first line of the output should contain "first_name" Scenario: Tail ORC with -n 3 When I run `datu convert fixtures/userdata5.avro $TEMPDIR/userdata5.orc --select id,first_name --limit 10` Then the command should succeed When I run `datu tail $TEMPDIR/userdata5.orc -n 3` Then the command should succeed - And the first line should contain "id" + And the first line of the output should contain "id" Scenario: Tail ORC with --select When I run `datu convert fixtures/userdata5.avro $TEMPDIR/userdata5.orc --select id,first_name --limit 10` Then the command should succeed When I run `datu tail $TEMPDIR/userdata5.orc -n 2 --select id,first_name` Then the command should succeed - And the first line should contain "id" - And the first line should contain "first_name" + And the first line of the output should contain "id" + And the first line of the output should contain "first_name" Scenario: Tail ORC with --output csv When I run `datu convert fixtures/userdata5.avro $TEMPDIR/userdata5.orc --select id,first_name --limit 10` Then the command should succeed When I run `datu tail $TEMPDIR/userdata5.orc -n 2 --output csv` Then the command should succeed - And the first line should contain "id" - And the first line should contain "first_name" + And the first line of the output should contain "id" + And the first line of the output should contain "first_name" Scenario: Tail ORC with --output json When I run `datu convert fixtures/userdata5.avro $TEMPDIR/userdata5.orc --select id,first_name --limit 10` @@ -87,8 +87,8 @@ Feature: Tail Scenario: Tail Parquet with --output csv When I run `datu tail fixtures/table.parquet -n 2 --output csv` Then the command should succeed - And the first line should contain "one" - And the first line should contain "two" + And the first line of the output should contain "one" + And the first line of the output should contain "two" Scenario: Tail Parquet with --output json When I run `datu tail fixtures/table.parquet -n 2 --output json` @@ -100,8 +100,8 @@ Feature: Tail Scenario: Tail Avro with --output csv When I run `datu tail fixtures/userdata5.avro -n 2 --output csv` Then the command should succeed - And the first line should contain "id" - And the first line should contain "first_name" + And the first line of the output should contain "id" + And the first line of the output should contain "first_name" Scenario: Tail Avro with --output json When I run `datu tail fixtures/userdata5.avro -n 2 --output json` diff --git a/features/repl/conversion.feature b/features/repl/conversion.feature new file mode 100644 index 0000000..fd5406c --- /dev/null +++ b/features/repl/conversion.feature @@ -0,0 +1,196 @@ +Feature: Conversion + + Scenario: Parquet to CSV + When the REPL is ran and the user types: + ``` + read("fixtures/userdata.parquet") |> write("$TEMPDIR/userdata.csv") + ``` + Then the file "$TEMPDIR/userdata.csv" should exist + And that file should be a CSV file + And the first line of that file should be: "registration_dttm,id,first_name,last_name,email,gender,ip_address,cc,country,birthdate,salary,title,comments" + + Scenario: Parquet to JSON + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> write("$TEMPDIR/table.json") + ``` + Then the file "$TEMPDIR/table.json" should exist + And that file should be valid JSON + And that file should contain "two" + And that file should contain "foo" + + Scenario: Parquet to YAML + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> write("$TEMPDIR/table.yaml") + ``` + Then the file "$TEMPDIR/table.yaml" should exist + And that file should be valid YAML + And that file should contain "two:" + And that file should contain "foo" + + Scenario: Parquet to Avro + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> write("$TEMPDIR/table.avro") + ``` + Then the file "$TEMPDIR/table.avro" should exist + And that file should be valid Avro + + Scenario: Parquet to XLSX + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> write("$TEMPDIR/table.xlsx") + ``` + Then the file "$TEMPDIR/table.xlsx" should exist + And that file should be valid XLSX + + Scenario: Avro to CSV + When the REPL is ran and the user types: + ``` + read("fixtures/userdata5.avro") |> write("$TEMPDIR/userdata5.csv") + ``` + Then the file "$TEMPDIR/userdata5.csv" should exist + And that file should be a CSV file + And the first line of that file should contain "id" + And the first line of that file should contain "first_name" + + Scenario: Avro to JSON + When the REPL is ran and the user types: + ``` + read("fixtures/userdata5.avro") |> write("$TEMPDIR/userdata5.json") + ``` + Then the file "$TEMPDIR/userdata5.json" should exist + And that file should be valid JSON + + Scenario: Avro to YAML + When the REPL is ran and the user types: + ``` + read("fixtures/userdata5.avro") |> write("$TEMPDIR/userdata5.yaml") + ``` + Then the file "$TEMPDIR/userdata5.yaml" should exist + And that file should be valid YAML + And that file should contain "id:" + And that file should contain "first_name:" + + Scenario: Avro to Parquet + When the REPL is ran and the user types: + ``` + read("fixtures/userdata5.avro") |> write("$TEMPDIR/userdata5.parquet") + ``` + Then the file "$TEMPDIR/userdata5.parquet" should exist + And that file should be valid Parquet + + Scenario: Avro to ORC + When the REPL is ran and the user types: + ``` + read("fixtures/userdata5.avro") |> write("$TEMPDIR/userdata5.orc") + ``` + Then the file "$TEMPDIR/userdata5.orc" should exist + And that file should be valid ORC + + Scenario: Avro to XLSX + When the REPL is ran and the user types: + ``` + read("fixtures/userdata5.avro") |> write("$TEMPDIR/userdata5.xlsx") + ``` + Then the file "$TEMPDIR/userdata5.xlsx" should exist + And that file should be valid XLSX + + Scenario: ORC to CSV + When the REPL is ran and the user types: + ``` + read("fixtures/userdata.orc") |> write("$TEMPDIR/userdata_orc.csv") + ``` + Then the file "$TEMPDIR/userdata_orc.csv" should exist + And that file should be a CSV file + + Scenario: ORC to JSON + When the REPL is ran and the user types: + ``` + read("fixtures/userdata.orc") |> write("$TEMPDIR/userdata_orc.json") + ``` + Then the file "$TEMPDIR/userdata_orc.json" should exist + And that file should be valid JSON + + Scenario: ORC to YAML + When the REPL is ran and the user types: + ``` + read("fixtures/userdata.orc") |> write("$TEMPDIR/userdata_orc.yaml") + ``` + Then the file "$TEMPDIR/userdata_orc.yaml" should exist + And that file should be valid YAML + + Scenario: ORC to Parquet + When the REPL is ran and the user types: + ``` + read("fixtures/userdata.orc") |> write("$TEMPDIR/userdata_orc.parquet") + ``` + Then the file "$TEMPDIR/userdata_orc.parquet" should exist + And that file should be valid Parquet + + Scenario: ORC to Avro + When the REPL is ran and the user types: + ``` + read("fixtures/userdata.orc") |> write("$TEMPDIR/userdata_orc.avro") + ``` + Then the file "$TEMPDIR/userdata_orc.avro" should exist + And that file should be valid Avro + + Scenario: ORC to XLSX + When the REPL is ran and the user types: + ``` + read("fixtures/userdata.orc") |> write("$TEMPDIR/userdata_orc.xlsx") + ``` + Then the file "$TEMPDIR/userdata_orc.xlsx" should exist + And that file should be valid XLSX + + Scenario: Parquet to CSV with select + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> select(:two, :four) |> write("$TEMPDIR/table_select.csv") + ``` + Then the file "$TEMPDIR/table_select.csv" should exist + And that file should be a CSV file + And the first line of that file should be: "two,four" + And that file should have 4 lines + + Scenario: Parquet to CSV with head + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> head(2) |> write("$TEMPDIR/table_head.csv") + ``` + Then the file "$TEMPDIR/table_head.csv" should exist + And that file should be a CSV file + And that file should have 3 lines + + Scenario: Avro to JSON with select and head + When the REPL is ran and the user types: + ``` + read("fixtures/userdata5.avro") |> select(:id, :first_name, :email) |> head(5) |> write("$TEMPDIR/userdata5_subset.json") + ``` + Then the file "$TEMPDIR/userdata5_subset.json" should exist + And that file should be valid JSON + And that file should contain "id" + And that file should contain "first_name" + And that file should contain "email" + + Scenario: Parquet to YAML with select + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> select(:two, :four) |> write("$TEMPDIR/table_select.yaml") + ``` + Then the file "$TEMPDIR/table_select.yaml" should exist + And that file should be valid YAML + And that file should contain "two:" + And that file should contain "four:" + + Scenario: Avro to CSV with head + When the REPL is ran and the user types: + ``` + read("fixtures/userdata5.avro") |> head(10) |> write("$TEMPDIR/userdata5_head.csv") + ``` + Then the file "$TEMPDIR/userdata5_head.csv" should exist + And that file should be a CSV file + And the first line of that file should contain "id" + And that file should have 11 lines diff --git a/features/repl/head.feature b/features/repl/head.feature new file mode 100644 index 0000000..7a5b33c --- /dev/null +++ b/features/repl/head.feature @@ -0,0 +1,122 @@ +Feature: Head + + Scenario: Head from Parquet + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> head(2) |> write("$TEMPDIR/head_parquet.csv") + ``` + Then the file "$TEMPDIR/head_parquet.csv" should exist + And that file should be a CSV file + And the first line of that file should be: "one,two,three,four,five,__index_level_0__" + And that file should have 3 lines + And that file should contain "foo" + And that file should contain "bar" + + Scenario: Head a single row + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> head(1) |> write("$TEMPDIR/head_one.csv") + ``` + Then the file "$TEMPDIR/head_one.csv" should exist + And that file should be a CSV file + And that file should have 2 lines + And that file should contain "foo" + + Scenario: Head more rows than available + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> head(100) |> write("$TEMPDIR/head_all.csv") + ``` + Then the file "$TEMPDIR/head_all.csv" should exist + And that file should be a CSV file + And that file should have 4 lines + + Scenario: Head from Avro + When the REPL is ran and the user types: + ``` + read("fixtures/userdata5.avro") |> head(5) |> write("$TEMPDIR/head_avro.csv") + ``` + Then the file "$TEMPDIR/head_avro.csv" should exist + And that file should be a CSV file + And the first line of that file should contain "id" + And the first line of that file should contain "first_name" + And that file should have 6 lines + + Scenario: Head from ORC + When the REPL is ran and the user types: + ``` + read("fixtures/userdata.orc") |> head(3) |> write("$TEMPDIR/head_orc.csv") + ``` + Then the file "$TEMPDIR/head_orc.csv" should exist + And that file should be a CSV file + And that file should have 4 lines + + Scenario: Head with select + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> select(:two, :three) |> head(2) |> write("$TEMPDIR/head_select.csv") + ``` + Then the file "$TEMPDIR/head_select.csv" should exist + And that file should be a CSV file + And the first line of that file should be: "two,three" + And that file should have 3 lines + + Scenario: Head to JSON + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> head(2) |> write("$TEMPDIR/head.json") + ``` + Then the file "$TEMPDIR/head.json" should exist + And that file should be valid JSON + And that file should contain "foo" + And that file should contain "bar" + And that file should have 2 records + + Scenario: Head to YAML + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> head(2) |> write("$TEMPDIR/head.yaml") + ``` + Then the file "$TEMPDIR/head.yaml" should exist + And that file should be valid YAML + And that file should contain "two:" + And that file should contain "foo" + + Scenario: Head to Parquet + When the REPL is ran and the user types: + ``` + read("fixtures/userdata5.avro") |> head(10) |> write("$TEMPDIR/head.parquet") + ``` + Then the file "$TEMPDIR/head.parquet" should exist + And that file should be valid Parquet + And that file should have 10 records + + Scenario: Head to Avro + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> head(2) |> write("$TEMPDIR/head.avro") + ``` + Then the file "$TEMPDIR/head.avro" should exist + And that file should be valid Avro + And that file should have 2 records + + Scenario: Head with select from Avro + When the REPL is ran and the user types: + ``` + read("fixtures/userdata5.avro") |> select(:id, :email) |> head(3) |> write("$TEMPDIR/head_select_avro.csv") + ``` + Then the file "$TEMPDIR/head_select_avro.csv" should exist + And that file should be a CSV file + And the first line of that file should be: "id,email" + And that file should have 4 lines + + Scenario: Head with select to JSON + When the REPL is ran and the user types: + ``` + read("fixtures/userdata5.avro") |> select(:id, :first_name) |> head(2) |> write("$TEMPDIR/head_select.json") + ``` + Then the file "$TEMPDIR/head_select.json" should exist + And that file should be valid JSON + And that file should contain "id" + And that file should contain "first_name" + And that file should have 2 records diff --git a/features/repl/repl.feature b/features/repl/repl.feature new file mode 100644 index 0000000..69d1e0e --- /dev/null +++ b/features/repl/repl.feature @@ -0,0 +1,8 @@ +Feature: REPL + + Scenario: + When datu is ran without a command + Then the output should be: + ``` + > + ``` diff --git a/features/repl/select.feature b/features/repl/select.feature new file mode 100644 index 0000000..f7657ab --- /dev/null +++ b/features/repl/select.feature @@ -0,0 +1,124 @@ +Feature: Select + + Scenario: Select columns using symbol syntax + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> select(:two, :three) |> write("$TEMPDIR/select_symbols.csv") + ``` + Then the file "$TEMPDIR/select_symbols.csv" should exist + And that file should be a CSV file + And the first line of that file should be: "two,three" + And that file should have 4 lines + + Scenario: Select columns using string syntax + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> select("two", "three") |> write("$TEMPDIR/select_strings.csv") + ``` + Then the file "$TEMPDIR/select_strings.csv" should exist + And that file should be a CSV file + And the first line of that file should be: "two,three" + And that file should have 4 lines + + Scenario: Select a single column + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> select(:two) |> write("$TEMPDIR/select_single.csv") + ``` + Then the file "$TEMPDIR/select_single.csv" should exist + And that file should be a CSV file + And the first line of that file should be: "two" + And that file should have 4 lines + + Scenario: Select reorders columns + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> select(:three, :one) |> write("$TEMPDIR/select_reorder.csv") + ``` + Then the file "$TEMPDIR/select_reorder.csv" should exist + And that file should be a CSV file + And the first line of that file should be: "three,one" + + Scenario: Select many columns + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> select(:five, :four, :three, :two, :one) |> write("$TEMPDIR/select_many.csv") + ``` + Then the file "$TEMPDIR/select_many.csv" should exist + And that file should be a CSV file + And the first line of that file should be: "five,four,three,two,one" + And that file should have 4 lines + + Scenario: Select from Avro + When the REPL is ran and the user types: + ``` + read("fixtures/userdata5.avro") |> select(:id, :email) |> write("$TEMPDIR/select_avro.csv") + ``` + Then the file "$TEMPDIR/select_avro.csv" should exist + And that file should be a CSV file + And the first line of that file should be: "id,email" + + Scenario: Select from ORC + When the REPL is ran and the user types: + ``` + read("fixtures/userdata.orc") |> select(:_col1, :_col2) |> write("$TEMPDIR/select_orc.csv") + ``` + Then the file "$TEMPDIR/select_orc.csv" should exist + And that file should be a CSV file + And the first line of that file should be: "_col1,_col2" + + Scenario: Select with head + When the REPL is ran and the user types: + ``` + read("fixtures/userdata5.avro") |> select(:id, :first_name) |> head(5) |> write("$TEMPDIR/select_head.csv") + ``` + Then the file "$TEMPDIR/select_head.csv" should exist + And that file should be a CSV file + And the first line of that file should be: "id,first_name" + And that file should have 6 lines + + Scenario: Select with tail + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> select(:two, :one) |> tail(1) |> write("$TEMPDIR/select_tail.csv") + ``` + Then the file "$TEMPDIR/select_tail.csv" should exist + And that file should be a CSV file + And the first line of that file should be: "two,one" + And that file should have 2 lines + + Scenario: Select to JSON + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> select(:two, :three) |> write("$TEMPDIR/select.json") + ``` + Then the file "$TEMPDIR/select.json" should exist + And that file should be valid JSON + And that file should contain "two" + And that file should contain "three" + + Scenario: Select to YAML + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> select(:two, :three) |> write("$TEMPDIR/select.yaml") + ``` + Then the file "$TEMPDIR/select.yaml" should exist + And that file should be valid YAML + And that file should contain "two:" + And that file should contain "three:" + + Scenario: Select to Avro + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> select(:one, :two) |> write("$TEMPDIR/select.avro") + ``` + Then the file "$TEMPDIR/select.avro" should exist + And that file should be valid Avro + + Scenario: Select to Parquet + When the REPL is ran and the user types: + ``` + read("fixtures/userdata5.avro") |> select(:id, :first_name, :email) |> write("$TEMPDIR/select.parquet") + ``` + Then the file "$TEMPDIR/select.parquet" should exist + And that file should be valid Parquet diff --git a/features/repl/tail.feature b/features/repl/tail.feature new file mode 100644 index 0000000..6eff6ad --- /dev/null +++ b/features/repl/tail.feature @@ -0,0 +1,118 @@ +Feature: Tail + + Scenario: Tail from Parquet + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> tail(2) |> write("$TEMPDIR/tail_parquet.csv") + ``` + Then the file "$TEMPDIR/tail_parquet.csv" should exist + And that file should be a CSV file + And the first line of that file should be: "one,two,three,four,five,__index_level_0__" + And that file should have 3 lines + And that file should contain "bar" + And that file should contain "baz" + + Scenario: Tail a single row + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> tail(1) |> write("$TEMPDIR/tail_one.csv") + ``` + Then the file "$TEMPDIR/tail_one.csv" should exist + And that file should be a CSV file + And that file should have 2 lines + And that file should contain "baz" + + Scenario: Tail more rows than available + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> tail(100) |> write("$TEMPDIR/tail_all.csv") + ``` + Then the file "$TEMPDIR/tail_all.csv" should exist + And that file should be a CSV file + And that file should have 4 lines + + Scenario: Tail from Avro + When the REPL is ran and the user types: + ``` + read("fixtures/userdata5.avro") |> tail(5) |> write("$TEMPDIR/tail_avro.csv") + ``` + Then the file "$TEMPDIR/tail_avro.csv" should exist + And that file should be a CSV file + And the first line of that file should contain "id" + And the first line of that file should contain "first_name" + And that file should have 6 lines + + Scenario: Tail from ORC + When the REPL is ran and the user types: + ``` + read("fixtures/userdata.orc") |> tail(3) |> write("$TEMPDIR/tail_orc.csv") + ``` + Then the file "$TEMPDIR/tail_orc.csv" should exist + And that file should be a CSV file + And that file should have 4 lines + + Scenario: Tail with select + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> select(:two, :three) |> tail(2) |> write("$TEMPDIR/tail_select.csv") + ``` + Then the file "$TEMPDIR/tail_select.csv" should exist + And that file should be a CSV file + And the first line of that file should be: "two,three" + And that file should have 3 lines + + Scenario: Tail to JSON + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> tail(2) |> write("$TEMPDIR/tail.json") + ``` + Then the file "$TEMPDIR/tail.json" should exist + And that file should be valid JSON + And that file should contain "bar" + And that file should contain "baz" + + Scenario: Tail to YAML + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> tail(2) |> write("$TEMPDIR/tail.yaml") + ``` + Then the file "$TEMPDIR/tail.yaml" should exist + And that file should be valid YAML + And that file should contain "two:" + And that file should contain "baz" + + Scenario: Tail to Parquet + When the REPL is ran and the user types: + ``` + read("fixtures/userdata5.avro") |> tail(10) |> write("$TEMPDIR/tail.parquet") + ``` + Then the file "$TEMPDIR/tail.parquet" should exist + And that file should be valid Parquet + + Scenario: Tail to Avro + When the REPL is ran and the user types: + ``` + read("fixtures/table.parquet") |> tail(2) |> write("$TEMPDIR/tail.avro") + ``` + Then the file "$TEMPDIR/tail.avro" should exist + And that file should be valid Avro + + Scenario: Tail with select from Avro + When the REPL is ran and the user types: + ``` + read("fixtures/userdata5.avro") |> select(:id, :email) |> tail(3) |> write("$TEMPDIR/tail_select_avro.csv") + ``` + Then the file "$TEMPDIR/tail_select_avro.csv" should exist + And that file should be a CSV file + And the first line of that file should be: "id,email" + And that file should have 4 lines + + Scenario: Tail with select to JSON + When the REPL is ran and the user types: + ``` + read("fixtures/userdata5.avro") |> select(:id, :first_name) |> tail(2) |> write("$TEMPDIR/tail_select.json") + ``` + Then the file "$TEMPDIR/tail_select.json" should exist + And that file should be valid JSON + And that file should contain "id" + And that file should contain "first_name" diff --git a/src/cli/repl.rs b/src/cli/repl.rs index b96d260..e3ab2a0 100644 --- a/src/cli/repl.rs +++ b/src/cli/repl.rs @@ -18,6 +18,7 @@ enum PipelineStage { Select { columns: Vec }, Head { n: usize }, Tail { n: usize }, + Count, Write { path: String }, Print, } @@ -97,6 +98,7 @@ impl ReplPipelineBuilder { PipelineStage::Select { columns } => self.exec_select(&columns).await, PipelineStage::Head { n } => self.exec_head(n), PipelineStage::Tail { n } => self.exec_tail(n), + PipelineStage::Count => self.exec_count(), PipelineStage::Write { path } => self.exec_write(&path).await, PipelineStage::Print => self.print_batches(), } @@ -172,6 +174,16 @@ impl ReplPipelineBuilder { Ok(()) } + /// Counts the total number of rows across all batches and prints the result. + fn exec_count(&mut self) -> crate::Result<()> { + let batches = self.batches.take().ok_or_else(|| { + Error::GenericError("count requires a preceding read in the pipe".to_string()) + })?; + let total: usize = batches.iter().map(|b| b.num_rows()).sum(); + println!("{total}"); + Ok(()) + } + /// Writes the batches in context to an output file. async fn exec_write(&mut self, output_path: &str) -> crate::Result<()> { let batches = self.batches.take().ok_or_else(|| { @@ -267,6 +279,10 @@ impl ReplPipelineBuilder { let path = extract_path_from_args("write", &args)?; self.exec_write(&path).await } + + fn eval_count(&mut self) -> crate::Result<()> { + self.exec_count() + } } #[cfg(test)] @@ -305,6 +321,14 @@ fn plan_stage(expr: Expr) -> crate::Result { let n = extract_tail_n(&args)?; Ok(PipelineStage::Tail { n }) } + "count" => { + if !args.is_empty() { + return Err(Error::UnsupportedFunctionCall( + "count takes no arguments".to_string(), + )); + } + Ok(PipelineStage::Count) + } "write" => { let path = extract_path_from_args("write", &args)?; Ok(PipelineStage::Write { path }) @@ -1177,6 +1201,96 @@ mod tests { assert!(ctx.batches.is_none(), "batches consumed by write"); } + // ── plan_stage: count ──────────────────────────────────────── + + #[test] + fn test_plan_stage_count() { + let expr = parse("count()"); + let stage = plan_stage(expr).unwrap(); + assert_eq!(stage, PipelineStage::Count); + } + + #[test] + fn test_plan_stage_count_rejects_args() { + let expr = Expr::FunctionCall( + Identifier("count".into()), + vec![Expr::Literal(Literal::String("extra".into()))], + ); + let result = plan_stage(expr); + assert!(result.is_err()); + assert!(matches!( + result.unwrap_err(), + Error::UnsupportedFunctionCall(_) + )); + } + + // ── plan_pipeline: count does not auto-append print ───────── + + #[test] + fn test_plan_pipeline_count_no_auto_print() { + let expr = parse(r#"read("a.parquet") |> count()"#); + let mut exprs = Vec::new(); + collect_pipe_stages(expr, &mut exprs); + let pipeline = plan_pipeline(exprs).unwrap(); + assert_eq!(pipeline.len(), 2); + assert_eq!( + pipeline[0], + PipelineStage::Read { + path: "a.parquet".to_string() + } + ); + assert_eq!(pipeline[1], PipelineStage::Count); + } + + // ── eval_count ───────────────────────────────────────────── + + #[tokio::test(flavor = "multi_thread")] + async fn test_eval_count_success() { + let mut ctx = new_context(); + ctx.eval_read(vec![Expr::Literal(Literal::String( + "fixtures/table.parquet".into(), + ))]) + .await + .expect("read"); + + ctx.eval_count().expect("count"); + assert!(ctx.batches.is_none(), "batches consumed by count"); + } + + #[test] + fn test_eval_count_no_preceding_read() { + let mut ctx = new_context(); + let result = ctx.eval_count(); + assert!(result.is_err()); + assert!(matches!(result.unwrap_err(), Error::GenericError(_))); + } + + // ── eval_count: pipeline integration ──────────────────────── + + #[tokio::test(flavor = "multi_thread")] + async fn test_repl_pipeline_read_count() { + let expr = parse(r#"read("fixtures/table.parquet") |> count()"#); + let mut ctx = new_context(); + ctx.eval(expr).await.expect("eval"); + assert!(ctx.batches.is_none(), "batches consumed by count"); + } + + #[tokio::test(flavor = "multi_thread")] + async fn test_repl_pipeline_read_select_count() { + let expr = parse(r#"read("fixtures/table.parquet") |> select(:one, :two) |> count()"#); + let mut ctx = new_context(); + ctx.eval(expr).await.expect("eval"); + assert!(ctx.batches.is_none(), "batches consumed by count"); + } + + #[tokio::test(flavor = "multi_thread")] + async fn test_repl_pipeline_read_head_count() { + let expr = parse(r#"read("fixtures/table.parquet") |> head(2) |> count()"#); + let mut ctx = new_context(); + ctx.eval(expr).await.expect("eval"); + assert!(ctx.batches.is_none(), "batches consumed by count"); + } + #[tokio::test(flavor = "multi_thread")] async fn test_repl_pipeline_read_select_tail_write() { let temp_dir = tempfile::tempdir().expect("tempdir"); diff --git a/tests/features.rs b/tests/features.rs index 445fd3e..d321ef4 100644 --- a/tests/features.rs +++ b/tests/features.rs @@ -74,7 +74,7 @@ fn command_should_succeed(world: &mut CliWorld) { ); } -#[then(regex = r#"^the first line should be: (.+)$"#)] +#[then(regex = r#"^the first line of the output should be: (.+)$"#)] fn first_line_should_be(world: &mut CliWorld, expected: String) { let output = world.output.as_ref().expect("No output captured"); let stdout = String::from_utf8_lossy(&output.stdout); @@ -87,6 +87,17 @@ fn first_line_should_be(world: &mut CliWorld, expected: String) { ); } +#[then(regex = r#"^the first line of the output should contain "(.+)"$"#)] +fn first_line_should_contain(world: &mut CliWorld, expected: String) { + let output = world.output.as_ref().expect("No output captured"); + let stdout = String::from_utf8_lossy(&output.stdout); + let first_line = stdout.lines().next().unwrap_or(""); + assert!( + first_line.contains(&expected), + "Expected first line to contain '{expected}', but got: {first_line}" + ); +} + #[then(regex = r#"^the output should be:$"#)] fn output_should_be_docstring(world: &mut CliWorld, step: &Step) { let expected = step @@ -367,5 +378,5 @@ fn that_file_should_have_n_lines(world: &mut CliWorld, n: usize) { } fn main() { - futures::executor::block_on(CliWorld::run("features")); + futures::executor::block_on(CliWorld::run("features/cli")); } diff --git a/tests/repl.rs b/tests/repl.rs new file mode 100644 index 0000000..efb4127 --- /dev/null +++ b/tests/repl.rs @@ -0,0 +1,340 @@ +use std::io::BufRead; +use std::io::BufReader; +use std::io::Write; +use std::path::Path; +use std::process::Stdio; +use std::time::Duration; + +use cucumber::World; +use cucumber::then; +use cucumber::when; +use expectrl::Expect; +use expectrl::session::OsSession; +use gherkin::Step; +use parquet::file::reader::FileReader; + +const TEMPDIR_PLACEHOLDER: &str = "$TEMPDIR"; + +#[derive(Debug, Default, World)] +pub struct ReplWorld { + session: Option, + temp_dir: Option, + last_file: Option, +} + +fn replace_tempdir(s: &str, temp_path: &str) -> String { + s.replace(TEMPDIR_PLACEHOLDER, temp_path) +} + +fn resolve_path(world: &ReplWorld, path: &str) -> String { + if let Some(ref temp_dir) = world.temp_dir { + let temp_path = temp_dir + .path() + .to_str() + .expect("Temp path is not valid UTF-8"); + replace_tempdir(path, temp_path) + } else { + path.to_string() + } +} + +fn ensure_temp_dir(world: &mut ReplWorld) -> String { + if world.temp_dir.is_none() { + world.temp_dir = Some(tempfile::tempdir().expect("Failed to create temp dir")); + } + world + .temp_dir + .as_ref() + .unwrap() + .path() + .to_str() + .expect("Temp path is not valid UTF-8") + .to_string() +} + +#[when(regex = r#"^datu is ran without a command$"#)] +fn run_datu_repl(world: &mut ReplWorld) { + let datu_path = std::env::var("CARGO_BIN_EXE_datu") + .expect("Environment variable 'CARGO_BIN_EXE_datu' not defined"); + let mut session = expectrl::spawn(datu_path).expect("Failed to spawn REPL"); + session.set_expect_timeout(Some(Duration::from_secs(5))); + world.session = Some(session); +} + +#[when(regex = r#"^the REPL is ran and the user types:$"#)] +fn repl_is_ran_and_user_types(world: &mut ReplWorld, step: &Step) { + let input = step.docstring.as_ref().expect("Step requires a docstring"); + let temp_path = ensure_temp_dir(world); + + let datu_path = std::env::var("CARGO_BIN_EXE_datu") + .expect("Environment variable 'CARGO_BIN_EXE_datu' not defined"); + + let mut child = std::process::Command::new(datu_path) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .expect("Failed to spawn datu REPL"); + + { + let stdin = child.stdin.as_mut().expect("Failed to open stdin"); + for line in input.trim().lines() { + let resolved = replace_tempdir(line.trim(), &temp_path); + writeln!(stdin, "{resolved}").expect("Failed to write to REPL stdin"); + } + } + drop(child.stdin.take()); + + let output = child.wait_with_output().expect("Failed to wait for datu"); + assert!( + output.status.success(), + "datu REPL exited with error.\nstdout: {}\nstderr: {}", + String::from_utf8_lossy(&output.stdout), + String::from_utf8_lossy(&output.stderr) + ); +} + +#[then(regex = r#"^the output should be:$"#)] +fn the_output_should_be(world: &mut ReplWorld, step: &Step) { + let expected = step.docstring.as_ref().expect("Step requires a docstring"); + let session = world.session.as_mut().expect("No session running"); + let expected_trimmed = expected.trim(); + session + .expect(expected_trimmed) + .unwrap_or_else(|e| panic!("Expected to find '{expected_trimmed}' in output: {e}")); +} + +#[then(regex = r#"^the file "(.+)" should exist$"#)] +fn file_should_exist(world: &mut ReplWorld, path: String) { + let path_resolved = resolve_path(world, &path); + assert!( + Path::new(&path_resolved).exists(), + "Expected file to exist: {path_resolved}" + ); + world.last_file = Some(path_resolved); +} + +#[then(regex = r#"^that file should be a CSV file$"#)] +fn that_file_should_be_csv(world: &mut ReplWorld) { + let path = world + .last_file + .as_ref() + .expect("No file has been set; use 'the file \"...\" should exist' first"); + let mut reader = csv::Reader::from_path(path).expect("Failed to open file as CSV"); + let headers = reader.headers().expect("Failed to read CSV headers"); + assert!( + !headers.is_empty(), + "Expected CSV file to have headers, but got none" + ); +} + +#[then(regex = r#"^the first line of that file should be: "(.+)"$"#)] +fn first_line_of_file_should_be(world: &mut ReplWorld, expected: String) { + let path = world + .last_file + .as_ref() + .expect("No file has been set; use 'the file \"...\" should exist' first"); + let file = std::fs::File::open(path).expect("Failed to open file"); + let first_line = std::io::BufReader::new(file) + .lines() + .next() + .expect("File is empty") + .expect("Failed to read line"); + assert!( + first_line == expected, + "Expected first line to be '{expected}', but got: {first_line}" + ); +} + +#[then(regex = r#"^the first line of that file should contain "(.+)"$"#)] +fn first_line_of_file_should_contain(world: &mut ReplWorld, expected: String) { + let path = world + .last_file + .as_ref() + .expect("No file has been set; use 'the file \"...\" should exist' first"); + let file = std::fs::File::open(path).expect("Failed to open file"); + let first_line = std::io::BufReader::new(file) + .lines() + .next() + .expect("File is empty") + .expect("Failed to read line"); + assert!( + first_line.contains(&expected), + "Expected first line to contain '{expected}', but got: {first_line}" + ); +} + +#[then(regex = r#"^that file should be valid JSON$"#)] +fn that_file_should_be_valid_json(world: &mut ReplWorld) { + let path = world + .last_file + .as_ref() + .expect("No file has been set; use 'the file \"...\" should exist' first"); + let content = std::fs::read_to_string(path).expect("Failed to read file"); + serde_json::from_str::(content.trim()) + .expect("Expected file to contain valid JSON, but parsing failed"); +} + +#[then(regex = r#"^that file should be valid YAML$"#)] +fn that_file_should_be_valid_yaml(world: &mut ReplWorld) { + let path = world + .last_file + .as_ref() + .expect("No file has been set; use 'the file \"...\" should exist' first"); + let content = std::fs::read_to_string(path).expect("Failed to read file"); + serde_yaml::from_str::(content.trim()) + .expect("Expected file to contain valid YAML, but parsing failed"); +} + +#[then(regex = r#"^that file should be valid Avro$"#)] +fn that_file_should_be_valid_avro(world: &mut ReplWorld) { + let path = world + .last_file + .as_ref() + .expect("No file has been set; use 'the file \"...\" should exist' first"); + let file = std::fs::File::open(path).expect("Failed to open file"); + let reader = arrow_avro::reader::ReaderBuilder::new() + .build(BufReader::new(file)) + .expect("Expected file to be valid Avro, but reading failed"); + let schema = reader.schema(); + assert!( + !schema.fields().is_empty(), + "Expected Avro file to have at least one field" + ); +} + +#[then(regex = r#"^that file should be valid Parquet$"#)] +fn that_file_should_be_valid_parquet(world: &mut ReplWorld) { + let path = world + .last_file + .as_ref() + .expect("No file has been set; use 'the file \"...\" should exist' first"); + let file = std::fs::File::open(path).expect("Failed to open file"); + let reader = parquet::file::reader::SerializedFileReader::new(file) + .expect("Expected file to be valid Parquet, but reading failed"); + let metadata = reader.metadata(); + assert!( + !metadata.file_metadata().schema().get_fields().is_empty(), + "Expected Parquet file to have at least one column" + ); +} + +#[then(regex = r#"^that file should be valid ORC$"#)] +fn that_file_should_be_valid_orc(world: &mut ReplWorld) { + let path = world + .last_file + .as_ref() + .expect("No file has been set; use 'the file \"...\" should exist' first"); + let file = std::fs::File::open(path).expect("Failed to open file"); + let builder = orc_rust::arrow_reader::ArrowReaderBuilder::try_new(file) + .expect("Expected file to be valid ORC, but reading failed"); + let schema = builder.schema(); + assert!( + !schema.fields().is_empty(), + "Expected ORC file to have at least one column" + ); +} + +#[then(regex = r#"^that file should be valid XLSX$"#)] +fn that_file_should_be_valid_xlsx(world: &mut ReplWorld) { + let path = world + .last_file + .as_ref() + .expect("No file has been set; use 'the file \"...\" should exist' first"); + let bytes = std::fs::read(path).expect("Failed to read file"); + assert!( + bytes.len() >= 4 && bytes[..4] == [0x50, 0x4B, 0x03, 0x04], + "Expected file to be a valid XLSX (ZIP archive), but magic bytes did not match" + ); +} + +#[then(regex = r#"^that file should contain "(.+)"$"#)] +fn that_file_should_contain(world: &mut ReplWorld, expected: String) { + let path = world + .last_file + .as_ref() + .expect("No file has been set; use 'the file \"...\" should exist' first"); + let content = std::fs::read_to_string(path).expect("Failed to read file"); + assert!( + content.contains(&expected), + "Expected file {path} to contain '{expected}', but it did not" + ); +} + +#[then(regex = r#"^that file should have (\d+) lines$"#)] +fn that_file_should_have_n_lines(world: &mut ReplWorld, n: usize) { + let path = world + .last_file + .as_ref() + .expect("No file has been set; use 'the file \"...\" should exist' first"); + let file = std::fs::File::open(path).expect("Failed to open file"); + let line_count = std::io::BufReader::new(file) + .lines() + .filter(|r| r.as_ref().is_ok_and(|s| !s.trim().is_empty())) + .count(); + assert!( + line_count == n, + "Expected file {path} to have {n} lines, but got {line_count}" + ); +} + +#[then(regex = r#"^that file should have (\d+) records$"#)] +fn that_file_should_have_n_records(world: &mut ReplWorld, n: usize) { + use datu::utils::FileType; + + let path = world + .last_file + .as_ref() + .expect("No file has been set; use 'the file \"...\" should exist' first"); + let file_type = FileType::try_from(path.as_str()) + .unwrap_or_else(|e| panic!("Cannot determine file type for {path}: {e}")); + let row_count = match file_type { + FileType::Json => { + let content = std::fs::read_to_string(path).expect("Failed to read file"); + let value: serde_json::Value = + serde_json::from_str(content.trim()).expect("Failed to parse JSON"); + value + .as_array() + .expect("Expected JSON to be an array") + .len() + } + FileType::Parquet => { + let file = std::fs::File::open(path).expect("Failed to open file"); + let reader = parquet::file::reader::SerializedFileReader::new(file) + .expect("Failed to read Parquet file"); + reader + .metadata() + .row_groups() + .iter() + .map(|rg| rg.num_rows()) + .sum::() as usize + } + FileType::Avro => { + let file = std::fs::File::open(path).expect("Failed to open file"); + let reader = arrow_avro::reader::ReaderBuilder::new() + .build(BufReader::new(file)) + .expect("Failed to read Avro file"); + reader + .map(|batch| batch.expect("Failed to read Avro batch").num_rows()) + .sum() + } + FileType::Orc => { + let file = std::fs::File::open(path).expect("Failed to open file"); + let builder = orc_rust::arrow_reader::ArrowReaderBuilder::try_new(file) + .expect("Failed to read ORC file"); + let reader = builder.build(); + reader + .map(|batch| batch.expect("Failed to read ORC batch").num_rows()) + .sum() + } + other => panic!("Unsupported format for record counting: {other:?}"), + }; + assert!( + row_count == n, + "Expected file {path} to have {n} records, but got {row_count}" + ); +} + +fn main() { + futures::executor::block_on(ReplWorld::run("features/repl")); +}