Commit 9b1a13c

Update get_all_files_paths_under examples to include keep_extensions (#450)
* add keep_extensions param to existing scripts and docs
* run black

Signed-off-by: Sarah Yurick <sarahyurick@gmail.com>
1 parent a46fb87 commit 9b1a13c
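
To summarize the change this commit applies throughout the docs and scripts: every call that listed a directory and implicitly picked up all files now filters by extension at listing time. A minimal before/after sketch of the pattern (the path is one of the illustrative directories used in the diffs below):

    from nemo_curator.utils.file_utils import get_all_files_paths_under

    # Before: returns every file path under the directory,
    # including any non-JSONL artifacts that happen to live there.
    files = get_all_files_paths_under("books_dataset/")

    # After: the listing itself is restricted to files ending in ".jsonl".
    files = get_all_files_paths_under("books_dataset/", keep_extensions="jsonl")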

File tree

17 files changed: +36 -32 lines

docs/user-guide/distributeddataclassification.rst

Lines changed: 8 additions & 8 deletions
@@ -65,7 +65,7 @@ Let's see how ``DomainClassifier`` works in a small excerpt taken from ``example

     from nemo_curator.classifiers import DomainClassifier

-    files = get_all_files_paths_under("books_dataset/")
+    files = get_all_files_paths_under("books_dataset/", keep_extensions="jsonl")
     input_dataset = DocumentDataset.read_json(files, backend="cudf")

     domain_classifier = DomainClassifier(filter_by=["Games", "Sports"])

@@ -87,7 +87,7 @@ Using the ``MultilingualDomainClassifier`` is very similar to using the ``Domain

     from nemo_curator.classifiers import MultilingualDomainClassifier

-    files = get_all_files_paths_under("japanese_books_dataset/")
+    files = get_all_files_paths_under("japanese_books_dataset/", keep_extensions="jsonl")
     input_dataset = DocumentDataset.read_json(files, backend="cudf")

     multilingual_domain_classifier = MultilingualDomainClassifier(

@@ -110,7 +110,7 @@ Here's an example of how to use the ``QualityClassifier``:

     from nemo_curator.classifiers import QualityClassifier

-    files = get_all_files_paths_under("web_documents/")
+    files = get_all_files_paths_under("web_documents/", keep_extensions="jsonl")
     input_dataset = DocumentDataset.read_json(files, backend="cudf")

     quality_classifier = QualityClassifier(filter_by=["High", "Medium"])

@@ -138,7 +138,7 @@ NeMo Curator provides an easy way to annotate and filter your data using the saf

 .. code-block:: python

-    files = get_all_files_paths_under("unsafe_documents/")
+    files = get_all_files_paths_under("unsafe_documents/", keep_extensions="jsonl")
     input_dataset = DocumentDataset.read_json(files, backend="cudf")

     token = "hf_1234" # Replace with your user access token

@@ -185,7 +185,7 @@ Here is a small example of how to use the ``InstructionDataGuardClassifier``:

     # The model expects instruction-response style text data. For example:
     # "Instruction: {instruction}. Input: {input_}. Response: {response}."
-    files = get_all_files_paths_under("instruction_input_response_dataset/")
+    files = get_all_files_paths_under("instruction_input_response_dataset/", keep_extensions="jsonl")
     input_dataset = DocumentDataset.read_json(files, backend="cudf")

     token = "hf_1234" # Replace with your user access token

@@ -214,7 +214,7 @@ To use the FineWeb Educational Content Classifier, you can follow this example:

     from nemo_curator.classifiers import FineWebEduClassifier

-    files = get_all_files_paths_under("web_documents/")
+    files = get_all_files_paths_under("web_documents/", keep_extensions="jsonl")
     input_dataset = DocumentDataset.read_json(files, backend="cudf")

     edu_classifier = FineWebEduClassifier(

@@ -337,7 +337,7 @@ Let's see how ``ContentTypeClassifier`` works in a small excerpt taken from ``ex

     from nemo_curator.classifiers import ContentTypeClassifier

-    files = get_all_files_paths_under("books_dataset/")
+    files = get_all_files_paths_under("books_dataset/", keep_extensions="jsonl")
     input_dataset = DocumentDataset.read_json(files, backend="cudf")

     content_type_classifier = ContentTypeClassifier(filter_by=["Blogs", "News"])

@@ -359,7 +359,7 @@ Here's an example of how to use the ``PromptTaskComplexityClassifier``:

     from nemo_curator.classifiers import PromptTaskComplexityClassifier

-    files = get_all_files_paths_under("my_dataset/")
+    files = get_all_files_paths_under("my_dataset/", keep_extensions="jsonl")
     input_dataset = DocumentDataset.read_json(files, backend="cudf")

     classifier = PromptTaskComplexityClassifier()
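
Every classifier snippet above follows the same read-then-classify shape. A consolidated sketch for ``DomainClassifier``, with the classifier invocation and the output step assumed (they fall outside the hunks shown here):

    from nemo_curator.classifiers import DomainClassifier
    from nemo_curator.datasets import DocumentDataset
    from nemo_curator.utils.file_utils import get_all_files_paths_under

    files = get_all_files_paths_under("books_dataset/", keep_extensions="jsonl")
    input_dataset = DocumentDataset.read_json(files, backend="cudf")

    domain_classifier = DomainClassifier(filter_by=["Games", "Sports"])
    result_dataset = domain_classifier(dataset=input_dataset)  # assumed call signature
    result_dataset.to_json("books_by_domain/")  # assumed output path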

docs/user-guide/documentdataset.rst

Lines changed: 2 additions & 2 deletions
@@ -43,7 +43,7 @@ You could read, filter the dataset, and write it using the following methods
     from nemo_curator.utils.file_utils import get_all_files_paths_under
     from nemo_curator.filters import WordCountFilter

-    files = get_all_files_paths_under("books_dataset/")
+    files = get_all_files_paths_under("books_dataset/", keep_extensions="jsonl")
     books = DocumentDataset.read_json(files, add_filename=True)

     filter_step = nc.ScoreFilter(

@@ -58,7 +58,7 @@ You could read, filter the dataset, and write it using the following methods

 Let's walk through this code line by line.

-* ``files = get_all_files_paths_under("books_dataset/")`` This retrieves a list of all files in the given directory.
+* ``files = get_all_files_paths_under("books_dataset/", keep_extensions="jsonl")`` This retrieves a list of all files in the given directory, then filters the list to include only files ending with ".jsonl".
   In our case, this is equivalent to writing

   .. code-block:: python
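
The hunk cuts off at the ``code-block`` directive, so the docs' own expansion is not reproduced here. A plausible sketch of what "equivalent to writing" means, assuming a flat directory listing (not the file's actual continuation):

    import os

    files = []
    for fname in os.listdir("books_dataset"):
        fpath = os.path.join("books_dataset", fname)
        if os.path.isfile(fpath) and fpath.endswith(".jsonl"):
            files.append(fpath)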

docs/user-guide/qualityfiltering.rst

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@ Let's examine this small example:
     from nemo_curator.utils.file_utils import get_all_files_paths_under
     from nemo_curator.filters import WordCountFilter

-    files = get_all_files_paths_under("books_dataset/")
+    files = get_all_files_paths_under("books_dataset/", keep_extensions="jsonl")
     books = DocumentDataset.read_json(files, add_filename=True)

     filter_step = nc.ScoreFilter(
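
The hunk ends mid-statement at ``nc.ScoreFilter(``. For orientation only, a hedged completion of that filter step, with the threshold and field names assumed rather than taken from this diff:

    filter_step = nc.ScoreFilter(
        WordCountFilter(min_words=80),  # assumed threshold
        text_field="text",              # assumed field names
        score_field="word_count",
    )
    long_books = filter_step(books)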

docs/user-guide/sparkother.rst

Lines changed: 1 addition & 1 deletion
@@ -91,4 +91,4 @@ The following code snippet demonstrates how to read output from a Spark DataFram
     stories_dataset = DocumentDataset.read_parquet(processed_files, backend="pandas")

 It is worth noting that Spark typically tends to create checksum and other marker files which can vary by Spark distribution,
-so it is advisable to ignore them when reading data into a NeMo Curator ``DocumentDataset``.
+so it is advisable to ignore them when reading data into a NeMo Curator ``DocumentDataset``.
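
Given that note, ``keep_extensions`` is one way to skip Spark's ``_SUCCESS`` and ``.crc`` marker files when collecting the Parquet parts. A sketch; the directory name is an assumption:

    from nemo_curator.datasets import DocumentDataset
    from nemo_curator.utils.file_utils import get_all_files_paths_under

    # Keep only the .parquet part files; checksum and marker files are dropped.
    processed_files = get_all_files_paths_under(
        "spark_output/", keep_extensions="parquet"
    )
    stories_dataset = DocumentDataset.read_parquet(processed_files, backend="pandas")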

docs/user-guide/taskdecontamination.rst

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@ Let's examine this small example:
     from nemo_curator.utils.file_utils import get_all_files_paths_under
     from nemo_curator.tasks import Winogrande, Squad, TriviaQA,

-    files = get_all_files_paths_under("books_dataset/")
+    files = get_all_files_paths_under("books_dataset/", keep_extensions="jsonl")
     books = DocumentDataset.read_json(files, add_filename=True)

     downstream_tasks = [
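
The hunk stops at ``downstream_tasks = [``. A hedged sketch of how the list and the decontamination step presumably continue, using only the task classes visible in the import above (and assuming ``import nemo_curator as nc`` earlier in the example):

    downstream_tasks = [
        Winogrande(),
        Squad(),
        TriviaQA(),
    ]
    decontaminator = nc.TaskDecontamination(downstream_tasks)
    decontaminated_books = decontaminator(books)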

examples/classifier_filtering.py

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@


 def load_dataset(input_data_dir):
-    files = list(get_all_files_paths_under(input_data_dir))
+    files = list(get_all_files_paths_under(input_data_dir, keep_extensions="jsonl"))
     raw_data = read_data(files, file_type="jsonl", backend="pandas", add_filename=True)
     dataset = DocumentDataset(raw_data)

examples/identify_languages.py

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@


 def load_dataset(input_data_dir):
-    files = list(get_all_files_paths_under(input_data_dir))
+    files = list(get_all_files_paths_under(input_data_dir, keep_extensions="jsonl"))
     raw_data = read_data(files, file_type="jsonl", backend="pandas", add_filename=True)
     dataset = DocumentDataset(raw_data)

examples/task_decontamination.py

Lines changed: 1 addition & 1 deletion
@@ -44,7 +44,7 @@


 def load_dataset(input_data_dir):
-    files = list(get_all_files_paths_under(input_data_dir))
+    files = list(get_all_files_paths_under(input_data_dir, keep_extensions="jsonl"))
     raw_data = read_data(files, file_type="jsonl", backend="pandas", add_filename=True)
     dataset = DocumentDataset(raw_data)
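
All three example scripts share this ``load_dataset`` helper, so the practical effect of the change is the same in each: stray non-JSONL files in the input directory no longer reach ``read_data``, which expects JSONL. An illustrative (hypothetical) layout:

    # books_dataset/
    #   part_0.jsonl      <- kept
    #   part_1.jsonl      <- kept
    #   checksum.md5      <- previously listed too; now filtered out
    dataset = load_dataset("books_dataset/")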

nemo_curator/scripts/find_exact_duplicates.py

Lines changed: 3 additions & 2 deletions
@@ -55,8 +55,9 @@ def main(args):
         if num_files is not None and num_files <= 0:
             logger.info(f"Processed {num_files}... quitting")
             break
-        files = get_all_files_paths_under(root=data_path, recurse_subdirectories=False)
-        files = [f for f in files if f.endswith(".jsonl")]
+        files = get_all_files_paths_under(
+            root=data_path, recurse_subdirectories=False, keep_extensions="jsonl"
+        )
         df = read_data(
             files[:num_files] if num_files else files,
             file_type="jsonl",

nemo_curator/scripts/fuzzy_deduplication/compute_minhashes.py

Lines changed: 3 additions & 2 deletions
@@ -70,8 +70,9 @@ def main(args):
             print(f"Processed {args.num_files}... quitting")
             break

-        files = get_all_files_paths_under(root=data_path, recurse_subdirectories=False)
-        files = [f for f in files if f.endswith(".jsonl")]
+        files = get_all_files_paths_under(
+            root=data_path, recurse_subdirectories=False, keep_extensions="jsonl"
+        )
         df = read_data(
             files[:num_files] if num_files else files,
             file_type="jsonl",
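
In both deduplication scripts the new keyword simply folds the old post-hoc extension filter into the directory walk. Restating the equivalence with the names used in the hunks above:

    # Old two-step pattern:
    files = get_all_files_paths_under(root=data_path, recurse_subdirectories=False)
    files = [f for f in files if f.endswith(".jsonl")]

    # New single call; extension filtering happens inside the utility:
    files = get_all_files_paths_under(
        root=data_path, recurse_subdirectories=False, keep_extensions="jsonl"
    )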
