feat: Add support for reading whole text files to read_text

plotor · plotor · commit ecaddb910ae7 · 2026-03-06T16:54:31.000+08:00
Signed-off-by: plotor <zhenchao.wang@hotmail.com>
diff --git a/daft/daft/__init__.pyi b/daft/daft/__init__.pyi
@@ -324,10 +324,18 @@ class TextSourceConfig:
 
     encoding: str
     skip_blank_lines: bool
+    whole_text: bool
     buffer_size: int | None
     chunk_size: int | None
 
-    def __init__(self, encoding: str, skip_blank_lines: bool, buffer_size: int | None, chunk_size: int | None): ...
+    def __init__(
+        self,
+        encoding: str,
+        skip_blank_lines: bool,
+        whole_text: bool,
+        buffer_size: int | None,
+        chunk_size: int | None,
+    ): ...
 
 class FileFormatConfig:
     """Configuration for parsing a particular file format (Parquet, CSV, JSON)."""
diff --git a/daft/io/_text.py b/daft/io/_text.py
@@ -14,6 +14,7 @@ def read_text(
     *,
     encoding: str = "utf-8",
     skip_blank_lines: bool = True,
+    whole_text: bool = False,
     file_path_column: str | None = None,
     hive_partitioning: bool = False,
     io_config: IOConfig | None = None,
@@ -26,6 +27,10 @@ def read_text(
         path: Path to text file(s). Supports wildcards and remote URLs such as ``s3://`` or ``gs://``.
         encoding: Encoding of the input files, defaults to ``"utf-8"``.
         skip_blank_lines: Whether to skip empty lines (after stripping whitespace). Defaults to ``True``.
+            When ``whole_text=True``, this skips files that are entirely blank.
+        whole_text: Whether to read each file as a single row. Defaults to ``False``.
+            When ``False``, each line in the file becomes a row in the DataFrame.
+            When ``True``, the entire content of each file becomes a single row in the DataFrame.
         file_path_column: Include the source path(s) as a column with this name. Defaults to ``None``.
         hive_partitioning: Whether to infer hive-style partitions from file paths and include them as
             columns in the DataFrame. Defaults to ``False``.
@@ -34,7 +39,8 @@ def read_text(
         _chunk_size: Optional tuning parameter for the underlying streaming reader chunk size (rows).
 
     Returns:
-        DataFrame: A DataFrame with a single ``"text"`` column containing lines from the input files.
+        DataFrame: A DataFrame with a single ``"text"`` column containing lines from the input files
+            (when ``whole_text=False``) or entire file contents (when ``whole_text=True``).
 
     Examples:
         Read a text file from a local path:
@@ -49,6 +55,11 @@ def read_text(
         >>> io_config = IOConfig(s3=S3Config(region="us-west-2", anonymous=True))
         >>> df = daft.read_text("s3://path/to/files-*.txt", io_config=io_config)
         >>> df.show()
+
+        Read multiple small files, each as a single row:
+
+        >>> df = daft.read_text("/path/to/files/*.txt", whole_text=True)
+        >>> df.show()
     """
     if isinstance(path, list) and len(path) == 0:
         raise ValueError("Cannot read DataFrame from empty list of text filepaths")
@@ -57,6 +68,7 @@ def read_text(
     text_config = TextSourceConfig(
         encoding=encoding,
         skip_blank_lines=skip_blank_lines,
+        whole_text=whole_text,
         buffer_size=_buffer_size,
         chunk_size=_chunk_size,
     )
diff --git a/src/common/file-formats/src/file_format_config.rs b/src/common/file-formats/src/file_format_config.rs
@@ -464,6 +464,7 @@ impl_bincode_py_state_serialization!(WarcSourceConfig);
 pub struct TextSourceConfig {
     pub encoding: String,
     pub skip_blank_lines: bool,
+    pub whole_text: bool,
     pub buffer_size: Option<usize>,
     pub chunk_size: Option<usize>,
 }
@@ -477,18 +478,21 @@ impl TextSourceConfig {
     #[pyo3(signature = (
         encoding,
         skip_blank_lines,
+        whole_text=false,
         buffer_size=None,
-        chunk_size=None
+        chunk_size=None,
     ))]
     fn new(
         encoding: String,
         skip_blank_lines: bool,
+        whole_text: bool,
         buffer_size: Option<usize>,
         chunk_size: Option<usize>,
     ) -> PyResult<Self> {
         Ok(Self {
             encoding,
             skip_blank_lines,
+            whole_text,
             buffer_size,
             chunk_size,
         })
@@ -501,6 +505,7 @@ impl TextSourceConfig {
         let mut res = vec![];
         res.push(format!("Encoding = {}", self.encoding));
         res.push(format!("Skip blank lines = {}", self.skip_blank_lines));
+        res.push(format!("Whole text = {}", self.whole_text));
         if let Some(buffer_size) = self.buffer_size {
             res.push(format!("Buffer size = {buffer_size}"));
         }
@@ -511,4 +516,16 @@ impl TextSourceConfig {
     }
 }
 
+impl Default for TextSourceConfig {
+    fn default() -> Self {
+        Self {
+            encoding: "utf-8".to_string(),
+            skip_blank_lines: true,
+            whole_text: false,
+            buffer_size: None,
+            chunk_size: None,
+        }
+    }
+}
+
 impl_bincode_py_state_serialization!(TextSourceConfig);
diff --git a/src/daft-local-execution/src/sources/scan_task.rs b/src/daft-local-execution/src/sources/scan_task.rs
@@ -678,6 +678,7 @@ async fn stream_scan_task(
             let convert_options = TextConvertOptions::new(
                 &cfg.encoding,
                 cfg.skip_blank_lines,
+                cfg.whole_text,
                 Some(schema_of_file),
                 scan_task.pushdowns.limit,
             );
diff --git a/src/daft-text/src/options.rs b/src/daft-text/src/options.rs
@@ -6,6 +6,7 @@ use serde::{Deserialize, Serialize};
 pub struct TextConvertOptions {
     pub encoding: String,
     pub skip_blank_lines: bool,
+    pub whole_text: bool,
     pub schema: Option<SchemaRef>,
     pub limit: Option<usize>,
 }
@@ -15,12 +16,14 @@ impl TextConvertOptions {
     pub fn new(
         encoding: &str,
         skip_blank_lines: bool,
+        whole_text: bool,
         schema: Option<SchemaRef>,
         limit: Option<usize>,
     ) -> Self {
         Self {
             encoding: encoding.to_string(),
             skip_blank_lines,
+            whole_text,
             schema,
             limit,
         }
@@ -29,7 +32,7 @@ impl TextConvertOptions {
 
 impl Default for TextConvertOptions {
     fn default() -> Self {
-        Self::new("utf-8", true, None, None)
+        Self::new("utf-8", true, false, None, None)
     }
 }
 
diff --git a/src/daft-text/src/read.rs b/src/daft-text/src/read.rs
@@ -9,7 +9,7 @@ use daft_recordbatch::RecordBatch;
 use futures::{Stream, StreamExt, stream::BoxStream};
 use tokio::{
     fs::File,
-    io::{AsyncBufRead, AsyncBufReadExt, BufReader},
+    io::{AsyncBufRead, AsyncBufReadExt, AsyncReadExt, BufReader},
 };
 use tokio_util::io::StreamReader;
 
@@ -40,6 +40,19 @@ pub async fn stream_text(
         .clone()
         .unwrap_or_else(|| Arc::new(Schema::new(vec![Field::new("text", DataType::Utf8)])));
 
+    // Check if we're reading the whole file as a single row
+    if convert_options.whole_text {
+        let whole_text_stream =
+            read_into_whole_text_stream(uri, convert_options, read_options, io_client, io_stats)
+                .await?;
+        return Ok(Box::pin(whole_text_stream.map(move |content_res| {
+            let content = content_res?;
+            let array = Utf8Array::from_values("text", std::iter::once(content.as_str()));
+            let series = array.into_series();
+            RecordBatch::new_with_size(schema.clone(), vec![series], 1)
+        })));
+    }
+
     // Build a stream of line chunks
     let line_chunk_stream =
         read_into_line_chunk_stream(uri, convert_options, read_options, io_client, io_stats)
@@ -60,6 +73,51 @@ pub async fn stream_text(
     Ok(Box::pin(table_stream))
 }
 
+async fn read_into_whole_text_stream(
+    uri: String,
+    convert_options: TextConvertOptions,
+    read_options: TextReadOptions,
+    io_client: Arc<IOClient>,
+    io_stats: Option<IOStatsRef>,
+) -> DaftResult<impl Stream<Item = DaftResult<String>> + Send> {
+    let buffer_size = read_options.buffer_size.unwrap_or(8 * 1024 * 1024);
+
+    let reader: Box<dyn AsyncBufRead + Unpin + Send> = match io_client
+        .single_url_get(uri.clone(), None, io_stats)
+        .await?
+    {
+        GetResult::File(file) => Box::new(BufReader::with_capacity(
+            buffer_size,
+            File::open(file.path).await?,
+        )),
+        GetResult::Stream(stream, ..) => Box::new(BufReader::with_capacity(
+            buffer_size,
+            StreamReader::new(stream),
+        )),
+    };
+
+    // If file is compressed, wrap stream in decoding stream.
+    let mut reader: Box<dyn AsyncBufRead + Unpin + Send> = match CompressionCodec::from_uri(&uri) {
+        Some(compression) => Box::new(BufReader::with_capacity(
+            buffer_size,
+            compression.to_decoder(reader),
+        )),
+        None => reader,
+    };
+
+    Ok(try_stream! {
+        let mut content = String::new();
+        reader.read_to_string(&mut content).await?;
+
+        // Apply skip_blank_lines if needed (for whole file, this means skip if entire content is blank)
+        if convert_options.skip_blank_lines && content.trim().is_empty() {
+            return;
+        }
+
+        yield content;
+    })
+}
+
 async fn read_into_line_chunk_stream(
     uri: String,
     convert_options: TextConvertOptions,
diff --git a/tests/io/test_text.py b/tests/io/test_text.py
@@ -153,3 +153,98 @@ def test_read_with_encoding_setting(tmp_path):
 
     with pytest.raises(Exception, match=r"(?i)utf-?8"):
         daft.read_text(str(path)).to_pydict()
+
+
+def test_read_whole_text_from_single_file(tmp_path):
+    path = tmp_path / "sample.txt"
+    path.write_text("hello\nworld\nfoo", encoding="utf-8")
+
+    df = daft.read_text(str(path), whole_text=True)
+    assert df.schema() == Schema.from_pyarrow_schema(pa.schema([("text", pa.string())]))
+    result = df.to_pydict()
+    assert result["text"] == ["hello\nworld\nfoo"]
+
+
+def test_read_whole_text_from_multiple_files(tmp_path):
+    file_a = tmp_path / "a.txt"
+    file_b = tmp_path / "b.txt"
+    file_a.write_text("content of file a\nwith multiple lines", encoding="utf-8")
+    file_b.write_text("content of file b", encoding="utf-8")
+
+    df = daft.read_text([str(file_a), str(file_b)], whole_text=True)
+    result = df.to_pydict()
+    assert len(result["text"]) == 2
+    assert "content of file a\nwith multiple lines" in result["text"]
+    assert "content of file b" in result["text"]
+
+
+def test_read_whole_text_with_path_column(tmp_path):
+    file_a = tmp_path / "a.txt"
+    file_b = tmp_path / "b.txt"
+    file_a.write_text("content a", encoding="utf-8")
+    file_b.write_text("content b", encoding="utf-8")
+
+    df = daft.read_text([str(file_a), str(file_b)], whole_text=True, file_path_column="path")
+    assert df.schema() == Schema.from_pyarrow_schema(pa.schema([("text", pa.string()), ("path", pa.string())]))
+
+    data = df.to_pydict()
+    assert len(data["text"]) == 2
+    assert len(data["path"]) == 2
+
+    rows = {(t, p) for t, p in zip(data["text"], data["path"])}
+    assert rows == {
+        ("content a", f"{tmp_path}/a.txt"),
+        ("content b", f"{tmp_path}/b.txt"),
+    }
+
+
+def test_read_whole_text_from_empty_file(tmp_path):
+    path = tmp_path / "empty.txt"
+    path.write_text("", encoding="utf-8")
+
+    df = daft.read_text(str(path), whole_text=True, skip_blank_lines=False)
+    result = df.to_pydict()
+    assert result["text"] == [""]
+
+    df = daft.read_text(str(path), whole_text=True, skip_blank_lines=True)
+    result = df.to_pydict()
+    assert result["text"] == []
+
+
+def test_read_whole_text_with_glob_patterns(tmp_path):
+    file_a = tmp_path / "a.txt"
+    file_b = tmp_path / "b.txt"
+    file_c = tmp_path / "c.txt"
+    file_d = tmp_path / "d.txt"
+    file_a.write_text("content a1", encoding="utf-8")
+    file_b.write_text("content b1\ncontent b2\t", encoding="utf-8")
+    file_c.write_text("content c1\ncontent c2\ncontent c3\n\t", encoding="utf-8")
+    file_d.write_text("", encoding="utf-8")
+
+    df = daft.read_text(
+        str(tmp_path / "*.txt"),
+        skip_blank_lines=True,
+        whole_text=True,
+        file_path_column="path",
+    )
+    data = df.to_pydict()
+    assert len(data["text"]) == 3
+    assert len(data["path"]) == 3
+
+    file_to_content = {p: t for p, t in zip(data["path"], data["text"])}
+    assert file_to_content[str(file_a)] == "content a1"
+    assert file_to_content[str(file_b)] == "content b1\ncontent b2\t"
+    assert file_to_content[str(file_c)] == "content c1\ncontent c2\ncontent c3\n\t"
+
+
+def test_read_whole_text_with_gzip(tmp_path):
+    def _write_gzip(path: Path, content: bytes) -> None:
+        with gzip.open(path, "wb") as f:
+            f.write(content)
+
+    path = tmp_path / "compressed.txt.gz"
+    _write_gzip(path, b"line1\nline2\nline3")
+
+    df = daft.read_text(str(path), whole_text=True)
+    result = df.to_pydict()
+    assert result["text"] == ["line1\nline2\nline3"]