Skip to content

Commit 37d42f4

Browse files
committed
Refactor pipeline datasource and IO argument modules.
This introduces a reusable DataSource abstraction and splits the read/write argument structs into dedicated modules, while updating the convert tests to use DataFrameReader directly.

Made-with: Cursor
1 parent f599ed8 commit 37d42f4

File tree

6 files changed

+148
-74
lines changed

6 files changed

+148
-74
lines changed

src/cli/convert.rs

Lines changed: 26 additions & 26 deletions
Original file line number · Diff line number · Diff line change
@@ -18,7 +18,7 @@ mod tests {
1818
use super::*;
1919
use crate::pipeline::Source;
2020
use crate::pipeline::Step;
21-
use crate::pipeline::dataframe::read_dataframe;
21+
use crate::pipeline::dataframe::DataFrameReader;
2222
use crate::pipeline::read_to_batches;
2323
use crate::pipeline::write_batches;
2424

@@ -43,7 +43,7 @@ mod tests {
4343

4444
#[tokio::test(flavor = "multi_thread")]
4545
async fn test_read_dataframe() {
46-
let df = *read_dataframe(
46+
let df = *DataFrameReader::new(
4747
"fixtures/table.parquet",
4848
FileType::Parquet,
4949
None,
@@ -61,7 +61,7 @@ mod tests {
6161
#[tokio::test(flavor = "multi_thread")]
6262
async fn test_read_dataframe_with_select() {
6363
let select = Some(vec!["one".to_string(), "two".to_string()]);
64-
let df = *read_dataframe(
64+
let df = *DataFrameReader::new(
6565
"fixtures/table.parquet",
6666
FileType::Parquet,
6767
select,
@@ -81,7 +81,7 @@ mod tests {
8181

8282
#[tokio::test(flavor = "multi_thread")]
8383
async fn test_read_dataframe_with_limit() {
84-
let df = *read_dataframe(
84+
let df = *DataFrameReader::new(
8585
"fixtures/table.parquet",
8686
FileType::Parquet,
8787
None,
@@ -99,7 +99,7 @@ mod tests {
9999
#[tokio::test(flavor = "multi_thread")]
100100
async fn test_read_dataframe_with_select_and_limit() {
101101
let select = Some(vec!["two".to_string()]);
102-
let df = *read_dataframe(
102+
let df = *DataFrameReader::new(
103103
"fixtures/table.parquet",
104104
FileType::Parquet,
105105
select,
@@ -119,7 +119,7 @@ mod tests {
119119

120120
#[tokio::test(flavor = "multi_thread")]
121121
async fn test_read_dataframe_avro() {
122-
let df = *read_dataframe(
122+
let df = *DataFrameReader::new(
123123
"fixtures/userdata5.avro",
124124
FileType::Avro,
125125
None,
@@ -136,7 +136,7 @@ mod tests {
136136

137137
#[tokio::test(flavor = "multi_thread")]
138138
async fn test_read_dataframe_orc() {
139-
let df = *read_dataframe("fixtures/userdata.orc", FileType::Orc, None, Some(5), None)
139+
let df = *DataFrameReader::new("fixtures/userdata.orc", FileType::Orc, None, Some(5), None)
140140
.execute(())
141141
.await
142142
.unwrap()
@@ -147,7 +147,7 @@ mod tests {
147147

148148
#[tokio::test(flavor = "multi_thread")]
149149
async fn test_read_dataframe_csv() {
150-
let df = *read_dataframe("fixtures/table.csv", FileType::Csv, None, Some(2), None)
150+
let df = *DataFrameReader::new("fixtures/table.csv", FileType::Csv, None, Some(2), None)
151151
.execute(())
152152
.await
153153
.unwrap()
@@ -158,7 +158,7 @@ mod tests {
158158

159159
#[tokio::test(flavor = "multi_thread")]
160160
async fn test_read_dataframe_unsupported_type() {
161-
let result = read_dataframe("fixtures/data.json", FileType::Json, None, None, None)
161+
let result = DataFrameReader::new("fixtures/data.json", FileType::Json, None, None, None)
162162
.execute(())
163163
.await;
164164
assert!(result.is_err());
@@ -174,7 +174,7 @@ mod tests {
174174

175175
#[tokio::test(flavor = "multi_thread")]
176176
async fn test_write_dataframe_to_parquet() {
177-
let source = read_dataframe(
177+
let source = DataFrameReader::new(
178178
"fixtures/table.parquet",
179179
FileType::Parquet,
180180
None,
@@ -192,7 +192,7 @@ mod tests {
192192
.unwrap();
193193
assert!(std::path::Path::new(&output).exists());
194194

195-
let df2 = *read_dataframe(&output, FileType::Parquet, None, None, None)
195+
let df2 = *DataFrameReader::new(&output, FileType::Parquet, None, None, None)
196196
.execute(())
197197
.await
198198
.unwrap()
@@ -203,7 +203,7 @@ mod tests {
203203

204204
#[tokio::test(flavor = "multi_thread")]
205205
async fn test_write_dataframe_to_csv() {
206-
let source = read_dataframe(
206+
let source = DataFrameReader::new(
207207
"fixtures/table.parquet",
208208
FileType::Parquet,
209209
None,
@@ -224,7 +224,7 @@ mod tests {
224224

225225
#[tokio::test(flavor = "multi_thread")]
226226
async fn test_write_dataframe_to_json() {
227-
let source = read_dataframe(
227+
let source = DataFrameReader::new(
228228
"fixtures/table.parquet",
229229
FileType::Parquet,
230230
None,
@@ -246,7 +246,7 @@ mod tests {
246246

247247
#[tokio::test(flavor = "multi_thread")]
248248
async fn test_write_dataframe_to_json_pretty() {
249-
let source = read_dataframe(
249+
let source = DataFrameReader::new(
250250
"fixtures/table.parquet",
251251
FileType::Parquet,
252252
None,
@@ -269,7 +269,7 @@ mod tests {
269269

270270
#[tokio::test(flavor = "multi_thread")]
271271
async fn test_write_dataframe_to_yaml() {
272-
let source = read_dataframe(
272+
let source = DataFrameReader::new(
273273
"fixtures/table.parquet",
274274
FileType::Parquet,
275275
None,
@@ -292,7 +292,7 @@ mod tests {
292292
#[tokio::test(flavor = "multi_thread")]
293293
async fn test_write_dataframe_to_avro() {
294294
let select = Some(vec!["two".to_string(), "three".to_string()]);
295-
let source = read_dataframe(
295+
let source = DataFrameReader::new(
296296
"fixtures/table.parquet",
297297
FileType::Parquet,
298298
select,
@@ -310,7 +310,7 @@ mod tests {
310310
.unwrap();
311311
assert!(std::path::Path::new(&output).exists());
312312

313-
let df2 = *read_dataframe(&output, FileType::Avro, None, None, None)
313+
let df2 = *DataFrameReader::new(&output, FileType::Avro, None, None, None)
314314
.execute(())
315315
.await
316316
.unwrap()
@@ -322,7 +322,7 @@ mod tests {
322322
#[tokio::test(flavor = "multi_thread")]
323323
async fn test_write_dataframe_to_orc() {
324324
let select = Some(vec!["id".to_string(), "first_name".to_string()]);
325-
let source = read_dataframe(
325+
let source = DataFrameReader::new(
326326
"fixtures/userdata5.avro",
327327
FileType::Avro,
328328
select,
@@ -340,7 +340,7 @@ mod tests {
340340
.unwrap();
341341
assert!(std::path::Path::new(&output).exists());
342342

343-
let df2 = *read_dataframe(&output, FileType::Orc, None, None, None)
343+
let df2 = *DataFrameReader::new(&output, FileType::Orc, None, None, None)
344344
.execute(())
345345
.await
346346
.unwrap()
@@ -351,7 +351,7 @@ mod tests {
351351

352352
#[tokio::test(flavor = "multi_thread")]
353353
async fn test_write_dataframe_to_xlsx() {
354-
let source = read_dataframe(
354+
let source = DataFrameReader::new(
355355
"fixtures/table.parquet",
356356
FileType::Parquet,
357357
None,
@@ -502,7 +502,7 @@ mod tests {
502502
let select = Some(vec!["two".to_string(), "three".to_string()]);
503503
let temp_dir = tempfile::tempdir().unwrap();
504504

505-
let source = read_dataframe(
505+
let source = DataFrameReader::new(
506506
"fixtures/table.parquet",
507507
FileType::Parquet,
508508
select,
@@ -518,7 +518,7 @@ mod tests {
518518
.await
519519
.unwrap();
520520

521-
let source2 = read_dataframe(&avro_path, FileType::Avro, None, None, None)
521+
let source2 = DataFrameReader::new(&avro_path, FileType::Avro, None, None, None)
522522
.execute(())
523523
.await
524524
.unwrap();
@@ -528,7 +528,7 @@ mod tests {
528528
.await
529529
.unwrap();
530530

531-
let df3 = *read_dataframe(&parquet_path, FileType::Parquet, None, None, None)
531+
let df3 = *DataFrameReader::new(&parquet_path, FileType::Parquet, None, None, None)
532532
.execute(())
533533
.await
534534
.unwrap()
@@ -542,7 +542,7 @@ mod tests {
542542
let select = Some(vec!["id".to_string(), "first_name".to_string()]);
543543
let temp_dir = tempfile::tempdir().unwrap();
544544

545-
let source = read_dataframe(
545+
let source = DataFrameReader::new(
546546
"fixtures/userdata5.avro",
547547
FileType::Avro,
548548
select,
@@ -558,7 +558,7 @@ mod tests {
558558
.await
559559
.unwrap();
560560

561-
let source2 = read_dataframe(&orc_path, FileType::Orc, None, None, None)
561+
let source2 = DataFrameReader::new(&orc_path, FileType::Orc, None, None, None)
562562
.execute(())
563563
.await
564564
.unwrap();
@@ -568,7 +568,7 @@ mod tests {
568568
.await
569569
.unwrap();
570570

571-
let df3 = *read_dataframe(&parquet_path, FileType::Parquet, None, None, None)
571+
let df3 = *DataFrameReader::new(&parquet_path, FileType::Parquet, None, None, None)
572572
.execute(())
573573
.await
574574
.unwrap()

src/pipeline.rs

Lines changed: 12 additions & 37 deletions
Original file line number · Diff line number · Diff line change
@@ -3,12 +3,15 @@
33
pub mod avro;
44
pub mod csv;
55
pub mod dataframe;
6+
pub mod datasource;
67
pub mod display;
78
pub mod json;
89
pub mod orc;
910
pub mod parquet;
11+
pub mod read;
1012
pub mod record_batch_filter;
1113
pub mod select;
14+
pub mod write;
1215
pub mod xlsx;
1316
pub mod yaml;
1417

@@ -18,37 +21,12 @@ use futures::StreamExt;
1821

1922
use crate::FileType;
2023
use crate::Result;
24+
use crate::pipeline::dataframe::DataFrameReader;
2125
use crate::pipeline::dataframe::DataFrameSource;
22-
23-
/// Arguments for reading a file (Avro, CSV, Parquet, ORC).
24-
pub struct ReadArgs {
25-
pub path: String,
26-
pub limit: Option<usize>,
27-
pub offset: Option<usize>,
28-
/// When reading CSV: has_header for CsvReadOptions. None is treated as true.
29-
pub csv_has_header: Option<bool>,
30-
}
31-
32-
/// Arguments for writing a file (CSV, Avro, Parquet, ORC, XLSX).
33-
pub struct WriteArgs {
34-
pub path: String,
35-
}
36-
37-
/// Arguments for writing a JSON file.
38-
pub struct WriteJsonArgs {
39-
pub path: String,
40-
/// When true, omit keys with null/missing values. When false, output default values.
41-
pub sparse: bool,
42-
/// When true, format output with indentation and newlines.
43-
pub pretty: bool,
44-
}
45-
46-
/// Arguments for writing a YAML file.
47-
pub struct WriteYamlArgs {
48-
pub path: String,
49-
/// When true, omit keys with null/missing values. When false, output default values.
50-
pub sparse: bool,
51-
}
26+
pub use crate::pipeline::read::ReadArgs;
27+
pub use crate::pipeline::write::WriteArgs;
28+
pub use crate::pipeline::write::WriteJsonArgs;
29+
pub use crate::pipeline::write::WriteYamlArgs;
5230

5331
/// A `Step` defines a step in the pipeline that can be executed
5432
/// and has an input and output type.
@@ -179,13 +157,10 @@ pub async fn read_to_batches(
179157
limit: Option<usize>,
180158
csv_has_header: Option<bool>,
181159
) -> anyhow::Result<Vec<arrow::record_batch::RecordBatch>> {
182-
let source = dataframe::read_dataframe(
183-
input_path,
184-
input_file_type,
185-
select.clone(),
186-
limit,
187-
csv_has_header,
188-
)
160+
let source = {
161+
let select = select.clone();
162+
DataFrameReader::new(input_path, input_file_type, select, limit, csv_has_header)
163+
}
189164
.execute(())
190165
.await?;
191166
let reader = DataFrameToBatchReader::try_new(source)

src/pipeline/dataframe.rs

Lines changed: 0 additions & 11 deletions
Original file line number · Diff line number · Diff line change
@@ -216,17 +216,6 @@ impl Step for DataFrameReader {
216216
}
217217
}
218218

219-
/// Creates a `DataFrameReader` that reads an input file into a DataFusion DataFrame.
220-
pub fn read_dataframe(
221-
input_path: &str,
222-
input_file_type: FileType,
223-
select: Option<Vec<String>>,
224-
limit: Option<usize>,
225-
csv_has_header: Option<bool>,
226-
) -> DataFrameReader {
227-
DataFrameReader::new(input_path, input_file_type, select, limit, csv_has_header)
228-
}
229-
230219
/// Reads an ORC file into record batches (ORC is not natively supported by DataFusion).
231220
/// Limit is applied via DataFusion after reading.
232221
fn read_orc_to_batches(path: &str) -> crate::Result<Vec<arrow::record_batch::RecordBatch>> {

0 commit comments

Comments (0)