Skip to content

Commit c170b5b

Browse files
Feat/add read nums (#144)
* feat: add param read_nums
* feat: add param read_nums
1 parent 56fd51d commit c170b5b

File tree

1 file changed, with 5 additions and 0 deletions.

graphgen/operators/read/read.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -53,6 +53,7 @@ def read(
5353
working_dir: Optional[str] = "cache",
5454
parallelism: int = 4,
5555
recursive: bool = True,
56+
read_nums: Optional[int] = None,
5657
**reader_kwargs: Any,
5758
) -> ray.data.Dataset:
5859
"""
@@ -63,6 +64,7 @@ def read(
6364
:param working_dir: Directory to cache intermediate files (PDF processing)
6465
:param parallelism: Number of parallel workers
6566
:param recursive: Whether to scan directories recursively
67+
:param read_nums: Limit the number of documents to read
6668
:param reader_kwargs: Additional kwargs passed to readers
6769
:return: Ray Dataset containing all documents
6870
"""
@@ -120,6 +122,9 @@ def read(
120122
}
121123
)
122124

125+
if read_nums is not None:
126+
combined_ds = combined_ds.limit(read_nums)
127+
123128
logger.info("[READ] Successfully read files from %s", input_path)
124129
return combined_ds
125130

0 commit comments

Comments
 (0)