Skip to content

Commit e475323

Browse files
Authored by plotor, desmondcheongzx and Claude
feat: Add read_text API to support reading text files (#6111)
## Changes Made <!-- Describe what changes were made and why. Include implementation details if necessary. --> Add the `read_text` API to support reading text files line by line. ## Related Issues <!-- Link to related GitHub issues, e.g., "Closes #123" --> #2859 --------- Signed-off-by: plotor <zhenchao.wang@hotmail.com> Co-authored-by: desmondcheongzx <desmondcheongzx@gmail.com> Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 80d670f commit e475323

File tree

24 files changed

+776
-28
lines changed

24 files changed

+776
-28
lines changed

Cargo.lock

Lines changed: 37 additions & 19 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ daft-compression = {path = "src/daft-compression", default-features = false}
2121
daft-context = {path = "src/daft-context", default-features = false}
2222
daft-core = {path = "src/daft-core", default-features = false}
2323
daft-csv = {path = "src/daft-csv", default-features = false}
24+
daft-text = {path = "src/daft-text", default-features = false}
2425
daft-dashboard = {path = "src/daft-dashboard", default-features = false}
2526
daft-distributed = {path = "src/daft-distributed", default-features = false}
2627
daft-dsl = {path = "src/daft-dsl", default-features = false}
@@ -78,6 +79,7 @@ python = [
7879
"daft-context/python",
7980
"daft-core/python",
8081
"daft-csv/python",
82+
"daft-text/python",
8183
"daft-dashboard/python",
8284
"daft-distributed/python",
8385
"daft-dsl/python",
@@ -228,7 +230,8 @@ members = [
228230
"src/daft-writers",
229231
"src/hyperloglog",
230232
"src/parquet2",
231-
"src/daft-cli"
233+
"src/daft-cli",
234+
"src/daft-text"
232235
]
233236

234237
[workspace.dependencies]

daft/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,7 @@ def refresh_logger() -> None:
138138
read_json,
139139
read_parquet,
140140
read_sql,
141+
read_text,
141142
read_video_frames,
142143
read_warc,
143144
read_huggingface,
@@ -259,6 +260,7 @@ def __getattr__(name: str) -> object:
259260
"read_parquet",
260261
"read_sql",
261262
"read_table",
263+
"read_text",
262264
"read_video_frames",
263265
"read_warc",
264266
"refresh_logger",

daft/context.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,7 @@ def set_execution_config(
172172
csv_target_filesize: int | None = None,
173173
csv_inflation_factor: float | None = None,
174174
json_inflation_factor: float | None = None,
175+
text_inflation_factor: float | None = None,
175176
shuffle_aggregation_default_partitions: int | None = None,
176177
partial_aggregation_threshold: int | None = None,
177178
high_cardinality_aggregation_threshold: float | None = None,
@@ -216,6 +217,7 @@ def set_execution_config(
216217
csv_target_filesize: Target File Size when writing out CSV Files. Defaults to 512MB
217218
csv_inflation_factor: Inflation Factor of CSV files (In-Memory-Size / File-Size) ratio. Defaults to 0.5
218219
json_inflation_factor: Inflation Factor of JSON files (In-Memory-Size / File-Size) ratio. Defaults to 0.25
220+
text_inflation_factor: Inflation Factor of Text files (In-Memory-Size / File-Size) ratio. Defaults to 1.0
219221
shuffle_aggregation_default_partitions: Maximum number of partitions to create when performing aggregations on the Ray Runner. Defaults to 200, unless the number of input partitions is less than 200.
220222
partial_aggregation_threshold: Threshold for performing partial aggregations on the Native Runner. Defaults to 10000 rows.
221223
high_cardinality_aggregation_threshold: Threshold selectivity for performing high cardinality aggregations on the Native Runner. Defaults to 0.8.
@@ -253,6 +255,7 @@ def set_execution_config(
253255
csv_target_filesize=csv_target_filesize,
254256
csv_inflation_factor=csv_inflation_factor,
255257
json_inflation_factor=json_inflation_factor,
258+
text_inflation_factor=text_inflation_factor,
256259
shuffle_aggregation_default_partitions=shuffle_aggregation_default_partitions,
257260
partial_aggregation_threshold=partial_aggregation_threshold,
258261
high_cardinality_aggregation_threshold=high_cardinality_aggregation_threshold,

daft/daft/__init__.pyi

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -319,6 +319,16 @@ class DatabaseSourceConfig:
319319

320320
def __init__(self, sql: str, conn_factory: SQLConnection): ...
321321

322+
class TextSourceConfig:
    """Configuration of a text data source.

    Passed to ``FileFormatConfig.from_text_config`` to describe how
    line-oriented text files should be read.
    """

    # Character encoding used to decode file contents (e.g. "utf-8").
    encoding: str
    # Whether lines that are empty (after stripping whitespace) are skipped.
    skip_blank_lines: bool
    # Streaming-reader buffer size in bytes; None uses the backend default.
    buffer_size: int | None
    # Streaming-reader chunk size in rows; None uses the backend default.
    chunk_size: int | None

    def __init__(self, encoding: str, skip_blank_lines: bool, buffer_size: int | None, chunk_size: int | None): ...
331+
322332
class FileFormatConfig:
323333
"""Configuration for parsing a particular file format (Parquet, CSV, JSON)."""
324334

@@ -349,6 +359,11 @@ class FileFormatConfig:
349359
"""Create a database file format config."""
350360
...
351361

362+
@staticmethod
363+
def from_text_config(config: TextSourceConfig) -> FileFormatConfig:
364+
"""Create a Text file format config."""
365+
...
366+
352367
def file_format(self) -> FileFormat:
353368
"""Get the file format for this config."""
354369
...
@@ -883,6 +898,7 @@ class TosConfig:
883898
) -> TosConfig:
884899
"""Replaces values if provided, returning a new TosConfig."""
885900
...
901+
886902
@staticmethod
887903
def from_env() -> TosConfig:
888904
"""Creates a TosConfig, retrieving credentials and configurations from the current environment.
@@ -970,6 +986,7 @@ class CosConfig:
970986
) -> CosConfig:
971987
"""Replaces values if provided, returning a new CosConfig."""
972988
...
989+
973990
@staticmethod
974991
def from_env() -> CosConfig:
975992
"""Creates a CosConfig, retrieving credentials and configurations from the current environment.
@@ -2227,6 +2244,7 @@ class PyDaftExecutionConfig:
22272244
csv_target_filesize: int | None = None,
22282245
csv_inflation_factor: float | None = None,
22292246
json_inflation_factor: float | None = None,
2247+
text_inflation_factor: float | None = None,
22302248
shuffle_aggregation_default_partitions: int | None = None,
22312249
partial_aggregation_threshold: int | None = None,
22322250
high_cardinality_aggregation_threshold: float | None = None,
@@ -2274,6 +2292,8 @@ class PyDaftExecutionConfig:
22742292
@property
22752293
def json_inflation_factor(self) -> float: ...
22762294
@property
2295+
def text_inflation_factor(self) -> float: ...
2296+
@property
22772297
def shuffle_aggregation_default_partitions(self) -> int: ...
22782298
@property
22792299
def partial_aggregation_threshold(self) -> int: ...

daft/io/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
)
1818
from daft.lazy_import import LazyImport
1919
from daft.io._csv import read_csv
20+
from daft.io._text import read_text
2021
from daft.io.delta_lake._deltalake import read_deltalake
2122
from daft.io.hudi._hudi import read_hudi
2223
from daft.io.iceberg._iceberg import read_iceberg
@@ -75,6 +76,7 @@ def __getattr__(name: str) -> object:
7576
"read_mcap",
7677
"read_parquet",
7778
"read_sql",
79+
"read_text",
7880
"read_video_frames",
7981
"read_warc",
8082
]

daft/io/_text.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
# ruff: noqa: I002
# isort: dont-add-import: from __future__ import annotations

from daft import DataType, context
from daft.api_annotations import PublicAPI
from daft.daft import FileFormatConfig, IOConfig, StorageConfig, TextSourceConfig
from daft.dataframe import DataFrame
from daft.io.common import get_tabular_files_scan


@PublicAPI
def read_text(
    path: str | list[str],
    *,
    encoding: str = "utf-8",
    skip_blank_lines: bool = True,
    file_path_column: str | None = None,
    hive_partitioning: bool = False,
    io_config: IOConfig | None = None,
    _buffer_size: int | None = None,
    _chunk_size: int | None = None,
) -> DataFrame:
    """Create a DataFrame whose rows are the lines of the given text file(s).

    Args:
        path: Path to text file(s). Supports wildcards and remote URLs such as ``s3://`` or ``gs://``.
        encoding: Encoding of the input files, defaults to ``"utf-8"``.
        skip_blank_lines: Whether to skip empty lines (after stripping whitespace). Defaults to ``True``.
        file_path_column: Include the source path(s) as a column with this name. Defaults to ``None``.
        hive_partitioning: Whether to infer hive-style partitions from file paths and include them as
            columns in the DataFrame. Defaults to ``False``.
        io_config: IO configuration for the native downloader.
        _buffer_size: Optional tuning parameter for the underlying streaming reader buffer size (bytes).
        _chunk_size: Optional tuning parameter for the underlying streaming reader chunk size (rows).

    Returns:
        DataFrame: A DataFrame with a single ``"text"`` column containing lines from the input files.

    Examples:
        Read a text file from a local path:

        >>> import daft
        >>> df = daft.read_text("/path/to/file.txt")
        >>> df.show()

        Read a text file from a public S3 bucket:

        >>> from daft.io import S3Config, IOConfig
        >>> io_config = IOConfig(s3=S3Config(region="us-west-2", anonymous=True))
        >>> df = daft.read_text("s3://path/to/files-*.txt", io_config=io_config)
        >>> df.show()
    """
    # Guard clause: an explicitly empty list of paths is a caller error.
    if isinstance(path, list) and not path:
        raise ValueError("Cannot read DataFrame from empty list of text filepaths")

    # Fall back to the planning-level default IO config when none is supplied.
    if io_config is None:
        io_config = context.get_context().daft_planning_config.default_io_config

    file_format_config = FileFormatConfig.from_text_config(
        TextSourceConfig(
            encoding=encoding,
            skip_blank_lines=skip_blank_lines,
            buffer_size=_buffer_size,
            chunk_size=_chunk_size,
        )
    )

    # The output schema of a text scan is fixed: one string column named "text".
    builder = get_tabular_files_scan(
        path=path,
        infer_schema=False,
        schema={"text": DataType.string()},
        file_format_config=file_format_config,
        storage_config=StorageConfig(True, io_config),
        file_path_column=file_path_column,
        hive_partitioning=hive_partitioning,
    )
    return DataFrame(builder)

0 commit comments

Comments
 (0)