Commit 9b1a13c

Update get_all_files_paths_under examples to include keep_extensions (#450)
* add keep_extensions param to existing scripts and docs
* run black

Signed-off-by: Sarah Yurick <sarahyurick@gmail.com>
1 parent a46fb87 commit 9b1a13c
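
To summarize the change this commit applies throughout the docs and scripts: every call that listed a directory and implicitly picked up all files now filters by extension at listing time. A minimal before/after sketch of the pattern (the path is one of the illustrative directories used in the diffs below):

    from nemo_curator.utils.file_utils import get_all_files_paths_under

    # Before: returns every file path under the directory,
    # including any non-JSONL artifacts that happen to live there.
    files = get_all_files_paths_under("books_dataset/")

    # After: the listing itself is restricted to files ending in ".jsonl".
    files = get_all_files_paths_under("books_dataset/", keep_extensions="jsonl")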

File tree

17 files changed: +36 -32 lines

docs/user-guide/distributeddataclassification.rst

Lines changed: 8 additions & 8 deletions
@@ -65,7 +65,7 @@ Let's see how ``DomainClassifier`` works in a small excerpt taken from ``example

     from nemo_curator.classifiers import DomainClassifier

-    files = get_all_files_paths_under("books_dataset/")
+    files = get_all_files_paths_under("books_dataset/", keep_extensions="jsonl")
     input_dataset = DocumentDataset.read_json(files, backend="cudf")

     domain_classifier = DomainClassifier(filter_by=["Games", "Sports"])

@@ -87,7 +87,7 @@ Using the ``MultilingualDomainClassifier`` is very similar to using the ``Domain

     from nemo_curator.classifiers import MultilingualDomainClassifier

-    files = get_all_files_paths_under("japanese_books_dataset/")
+    files = get_all_files_paths_under("japanese_books_dataset/", keep_extensions="jsonl")
     input_dataset = DocumentDataset.read_json(files, backend="cudf")

     multilingual_domain_classifier = MultilingualDomainClassifier(

@@ -110,7 +110,7 @@ Here's an example of how to use the ``QualityClassifier``:

     from nemo_curator.classifiers import QualityClassifier

-    files = get_all_files_paths_under("web_documents/")
+    files = get_all_files_paths_under("web_documents/", keep_extensions="jsonl")
     input_dataset = DocumentDataset.read_json(files, backend="cudf")

     quality_classifier = QualityClassifier(filter_by=["High", "Medium"])

@@ -138,7 +138,7 @@ NeMo Curator provides an easy way to annotate and filter your data using the saf

 .. code-block:: python

-    files = get_all_files_paths_under("unsafe_documents/")
+    files = get_all_files_paths_under("unsafe_documents/", keep_extensions="jsonl")
     input_dataset = DocumentDataset.read_json(files, backend="cudf")

     token = "hf_1234" # Replace with your user access token

@@ -185,7 +185,7 @@ Here is a small example of how to use the ``InstructionDataGuardClassifier``:

     # The model expects instruction-response style text data. For example:
     # "Instruction: {instruction}. Input: {input_}. Response: {response}."
-    files = get_all_files_paths_under("instruction_input_response_dataset/")
+    files = get_all_files_paths_under("instruction_input_response_dataset/", keep_extensions="jsonl")
     input_dataset = DocumentDataset.read_json(files, backend="cudf")

     token = "hf_1234" # Replace with your user access token

@@ -214,7 +214,7 @@ To use the FineWeb Educational Content Classifier, you can follow this example:

     from nemo_curator.classifiers import FineWebEduClassifier

-    files = get_all_files_paths_under("web_documents/")
+    files = get_all_files_paths_under("web_documents/", keep_extensions="jsonl")
     input_dataset = DocumentDataset.read_json(files, backend="cudf")

     edu_classifier = FineWebEduClassifier(

@@ -337,7 +337,7 @@ Let's see how ``ContentTypeClassifier`` works in a small excerpt taken from ``ex

     from nemo_curator.classifiers import ContentTypeClassifier

-    files = get_all_files_paths_under("books_dataset/")
+    files = get_all_files_paths_under("books_dataset/", keep_extensions="jsonl")
     input_dataset = DocumentDataset.read_json(files, backend="cudf")

     content_type_classifier = ContentTypeClassifier(filter_by=["Blogs", "News"])

@@ -359,7 +359,7 @@ Here's an example of how to use the ``PromptTaskComplexityClassifier``:

     from nemo_curator.classifiers import PromptTaskComplexityClassifier

-    files = get_all_files_paths_under("my_dataset/")
+    files = get_all_files_paths_under("my_dataset/", keep_extensions="jsonl")
     input_dataset = DocumentDataset.read_json(files, backend="cudf")

     classifier = PromptTaskComplexityClassifier()
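
Every classifier snippet above follows the same read-then-classify shape. A consolidated sketch for ``DomainClassifier``, with the classifier invocation and the output step assumed (they fall outside the hunks shown here):

    from nemo_curator.classifiers import DomainClassifier
    from nemo_curator.datasets import DocumentDataset
    from nemo_curator.utils.file_utils import get_all_files_paths_under

    files = get_all_files_paths_under("books_dataset/", keep_extensions="jsonl")
    input_dataset = DocumentDataset.read_json(files, backend="cudf")

    domain_classifier = DomainClassifier(filter_by=["Games", "Sports"])
    result_dataset = domain_classifier(dataset=input_dataset)  # assumed call signature
    result_dataset.to_json("books_by_domain/")  # assumed output path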

docs/user-guide/documentdataset.rst

Lines changed: 2 additions & 2 deletions
@@ -43,7 +43,7 @@ You could read, filter the dataset, and write it using the following methods
     from nemo_curator.utils.file_utils import get_all_files_paths_under
     from nemo_curator.filters import WordCountFilter

-    files = get_all_files_paths_under("books_dataset/")
+    files = get_all_files_paths_under("books_dataset/", keep_extensions="jsonl")
     books = DocumentDataset.read_json(files, add_filename=True)

     filter_step = nc.ScoreFilter(

@@ -58,7 +58,7 @@ You could read, filter the dataset, and write it using the following methods

 Let's walk through this code line by line.

-* ``files = get_all_files_paths_under("books_dataset/")`` This retrieves a list of all files in the given directory.
+* ``files = get_all_files_paths_under("books_dataset/", keep_extensions="jsonl")`` This retrieves a list of all files in the given directory, then filters the list to include only files ending with ".jsonl".
   In our case, this is equivalent to writing

   .. code-block:: python
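
The hunk cuts off at the ``code-block`` directive, so the docs' own expansion is not reproduced here. A plausible sketch of what "equivalent to writing" means, assuming a flat directory listing (not the file's actual continuation):

    import os

    files = []
    for fname in os.listdir("books_dataset"):
        fpath = os.path.join("books_dataset", fname)
        if os.path.isfile(fpath) and fpath.endswith(".jsonl"):
            files.append(fpath)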

docs/user-guide/qualityfiltering.rst

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@ Let's examine this small example:
     from nemo_curator.utils.file_utils import get_all_files_paths_under
     from nemo_curator.filters import WordCountFilter

-    files = get_all_files_paths_under("books_dataset/")
+    files = get_all_files_paths_under("books_dataset/", keep_extensions="jsonl")
     books = DocumentDataset.read_json(files, add_filename=True)

     filter_step = nc.ScoreFilter(
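
The hunk ends mid-statement at ``nc.ScoreFilter(``. For orientation only, a hedged completion of that filter step, with the threshold and field names assumed rather than taken from this diff:

    filter_step = nc.ScoreFilter(
        WordCountFilter(min_words=80),  # assumed threshold
        text_field="text",              # assumed field names
        score_field="word_count",
    )
    long_books = filter_step(books)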

docs/user-guide/sparkother.rst

Lines changed: 1 addition & 1 deletion
@@ -91,4 +91,4 @@ The following code snippet demonstrates how to read output from a Spark DataFram
     stories_dataset = DocumentDataset.read_parquet(processed_files, backend="pandas")

 It is worth noting that Spark typically tends to create checksum and other marker files which can vary by Spark distribution,
-so it is advisable to ignore them when reading data into a NeMo Curator ``DocumentDataset``.
+so it is advisable to ignore them when reading data into a NeMo Curator ``DocumentDataset``.
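
Given that note, ``keep_extensions`` is one way to skip Spark's ``_SUCCESS`` and ``.crc`` marker files when collecting the Parquet parts. A sketch; the directory name is an assumption:

    from nemo_curator.datasets import DocumentDataset
    from nemo_curator.utils.file_utils import get_all_files_paths_under

    # Keep only the .parquet part files; checksum and marker files are dropped.
    processed_files = get_all_files_paths_under(
        "spark_output/", keep_extensions="parquet"
    )
    stories_dataset = DocumentDataset.read_parquet(processed_files, backend="pandas")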

docs/user-guide/taskdecontamination.rst

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@ Let's examine this small example:
     from nemo_curator.utils.file_utils import get_all_files_paths_under
     from nemo_curator.tasks import Winogrande, Squad, TriviaQA,

-    files = get_all_files_paths_under("books_dataset/")
+    files = get_all_files_paths_under("books_dataset/", keep_extensions="jsonl")
     books = DocumentDataset.read_json(files, add_filename=True)

     downstream_tasks = [
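
The hunk stops at ``downstream_tasks = [``. A hedged sketch of how the list and the decontamination step presumably continue, using only the task classes visible in the import above (and assuming ``import nemo_curator as nc`` earlier in the example):

    downstream_tasks = [
        Winogrande(),
        Squad(),
        TriviaQA(),
    ]
    decontaminator = nc.TaskDecontamination(downstream_tasks)
    decontaminated_books = decontaminator(books)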

examples/classifier_filtering.py

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@


 def load_dataset(input_data_dir):
-    files = list(get_all_files_paths_under(input_data_dir))
+    files = list(get_all_files_paths_under(input_data_dir, keep_extensions="jsonl"))
     raw_data = read_data(files, file_type="jsonl", backend="pandas", add_filename=True)
     dataset = DocumentDataset(raw_data)

examples/identify_languages.py

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@


 def load_dataset(input_data_dir):
-    files = list(get_all_files_paths_under(input_data_dir))
+    files = list(get_all_files_paths_under(input_data_dir, keep_extensions="jsonl"))
     raw_data = read_data(files, file_type="jsonl", backend="pandas", add_filename=True)
     dataset = DocumentDataset(raw_data)

examples/task_decontamination.py

Lines changed: 1 addition & 1 deletion
@@ -44,7 +44,7 @@


 def load_dataset(input_data_dir):
-    files = list(get_all_files_paths_under(input_data_dir))
+    files = list(get_all_files_paths_under(input_data_dir, keep_extensions="jsonl"))
     raw_data = read_data(files, file_type="jsonl", backend="pandas", add_filename=True)
     dataset = DocumentDataset(raw_data)
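
All three example scripts share this ``load_dataset`` helper, so the practical effect of the change is the same in each: stray non-JSONL files in the input directory no longer reach ``read_data``, which expects JSONL. An illustrative (hypothetical) layout:

    # books_dataset/
    #   part_0.jsonl      <- kept
    #   part_1.jsonl      <- kept
    #   checksum.md5      <- previously listed too; now filtered out
    dataset = load_dataset("books_dataset/")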

nemo_curator/scripts/find_exact_duplicates.py

Lines changed: 3 additions & 2 deletions
@@ -55,8 +55,9 @@ def main(args):
         if num_files is not None and num_files <= 0:
             logger.info(f"Processed {num_files}... quitting")
             break
-        files = get_all_files_paths_under(root=data_path, recurse_subdirectories=False)
-        files = [f for f in files if f.endswith(".jsonl")]
+        files = get_all_files_paths_under(
+            root=data_path, recurse_subdirectories=False, keep_extensions="jsonl"
+        )
         df = read_data(
             files[:num_files] if num_files else files,
             file_type="jsonl",

nemo_curator/scripts/fuzzy_deduplication/compute_minhashes.py

Lines changed: 3 additions & 2 deletions
@@ -70,8 +70,9 @@ def main(args):
             print(f"Processed {args.num_files}... quitting")
             break

-        files = get_all_files_paths_under(root=data_path, recurse_subdirectories=False)
-        files = [f for f in files if f.endswith(".jsonl")]
+        files = get_all_files_paths_under(
+            root=data_path, recurse_subdirectories=False, keep_extensions="jsonl"
+        )
         df = read_data(
             files[:num_files] if num_files else files,
             file_type="jsonl",
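
In both deduplication scripts the new keyword simply folds the old post-hoc extension filter into the directory walk. Restating the equivalence with the names used in the hunks above:

    # Old two-step pattern:
    files = get_all_files_paths_under(root=data_path, recurse_subdirectories=False)
    files = [f for f in files if f.endswith(".jsonl")]

    # New single call; extension filtering happens inside the utility:
    files = get_all_files_paths_under(
        root=data_path, recurse_subdirectories=False, keep_extensions="jsonl"
    )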
