
Commit fa3bc37

Using DocumentDataset after removal in tutorials / examples (#724)
Parent: 28c1086

The deduplicators' remove() methods already return a DocumentDataset, so the examples and tutorials can write results with the dataset's own to_parquet() method and return them directly, instead of routing through the write_to_disk utility or re-wrapping the result in the DocumentDataset constructor.

File tree (5 files changed: +9 −11 lines)

examples/exact_deduplication.py
examples/fuzzy_deduplication.py
tutorials/dapt-curation/code/utils.py
tutorials/multimodal_dapt_curation/curator/utils.py
tutorials/tinystories/main.py

examples/exact_deduplication.py

Lines changed: 2 additions & 2 deletions
@@ -17,7 +17,7 @@
 
 from nemo_curator.datasets import DocumentDataset
 from nemo_curator.modules import ExactDuplicates
-from nemo_curator.utils.distributed_utils import get_client, write_to_disk
+from nemo_curator.utils.distributed_utils import get_client
 from nemo_curator.utils.script_utils import ArgumentHelper
 
 
@@ -60,7 +60,7 @@ def main(args: argparse.Namespace) -> None:
     duplicates = DocumentDataset.read_parquet(duplicates, backend=backend)
 
     result = exact_dup.remove(input_dataset, duplicates)
-    write_to_disk(result, output_dir, output_type="parquet")
+    result.to_parquet(output_dir)
     print(time.time() - t0)
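Pieced together from the hunks above, the example's updated flow looks roughly like the following. This is a minimal sketch: the paths are hypothetical, and the argument-free get_client() call assumes the library's local defaults, whereas the real script builds its client and paths from CLI arguments via ArgumentHelper.

# Minimal sketch of the updated exact-dedup flow; paths are hypothetical.
from nemo_curator.datasets import DocumentDataset
from nemo_curator.modules import ExactDuplicates
from nemo_curator.utils.distributed_utils import get_client

client = get_client()  # assumption: defaults give a usable local client

input_dataset = DocumentDataset.read_parquet("./input")  # hypothetical path
exact_dup = ExactDuplicates(id_field="id", text_field="text", hash_method="md5")

duplicates = exact_dup(input_dataset)                 # identify duplicate documents
result = exact_dup.remove(input_dataset, duplicates)  # already a DocumentDataset

# The change in this commit: write through the dataset itself
# rather than the write_to_disk utility.
result.to_parquet("./deduped")  # hypothetical output path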

examples/fuzzy_deduplication.py

Lines changed: 2 additions & 2 deletions
@@ -19,7 +19,7 @@
 
 from nemo_curator import FuzzyDuplicates, FuzzyDuplicatesConfig
 from nemo_curator.datasets import DocumentDataset
-from nemo_curator.utils.distributed_utils import get_client, write_to_disk
+from nemo_curator.utils.distributed_utils import get_client
 from nemo_curator.utils.script_utils import ArgumentHelper
 
 
@@ -92,7 +92,7 @@ def main(args: argparse.Namespace) -> None:
         return
 
     result = fuzzy_dup.remove(input_dataset, duplicates)
-    write_to_disk(result, output_dir, output_type=filetype)
+    result.to_parquet(output_dir)
     print(f"Time taken:{time.time() - t0}s")
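One behavioral nuance worth flagging: the fuzzy example previously wrote output in the input's format (output_type=filetype), whereas it now always writes Parquet. In sketch form, with fuzzy_dup, input_dataset, duplicates, and output_dir assumed to be set up as in the example:

# Only the changed tail of main() is sketched here; fuzzy_dup is a
# configured FuzzyDuplicates instance from earlier in the example.
result = fuzzy_dup.remove(input_dataset, duplicates)  # a DocumentDataset
result.to_parquet(output_dir)  # now Parquet regardless of the input filetype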

tutorials/dapt-curation/code/utils.py

Lines changed: 1 addition & 2 deletions
@@ -274,8 +274,7 @@ def exact_dedupe(dataset: DocumentDataset) -> DocumentDataset:
     deduplicator = ExactDuplicates(id_field="id", text_field="text", hash_method="md5")
     # Find the duplicates
     duplicates = deduplicator(dataset)
-    deduped = deduplicator.remove(dataset, duplicates)
-    return DocumentDataset(deduped)
+    return deduplicator.remove(dataset, duplicates)
 
 
 def fuzzy_dedupe(dataset: DocumentDataset, cache_dir: str) -> DocumentDataset:
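The same one-line fix lands in the multimodal tutorial and the TinyStories tutorial below. Since remove() already returns a DocumentDataset, re-wrapping its result in the DocumentDataset constructor was redundant. In isolation, the corrected helper reads as follows, a sketch assuming the imports this commit's examples already use:

from nemo_curator.datasets import DocumentDataset
from nemo_curator.modules import ExactDuplicates

def exact_dedupe(dataset: DocumentDataset) -> DocumentDataset:
    deduplicator = ExactDuplicates(id_field="id", text_field="text", hash_method="md5")
    duplicates = deduplicator(dataset)  # find the duplicate documents
    # remove() already yields a DocumentDataset, so return it directly
    return deduplicator.remove(dataset, duplicates)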

tutorials/multimodal_dapt_curation/curator/utils.py

Lines changed: 1 addition & 2 deletions
@@ -162,8 +162,7 @@ def exact_dedupe(dataset: DocumentDataset) -> DocumentDataset:
     deduplicator = ExactDuplicates(id_field="id", text_field="text", hash_method="md5")
     # Find the duplicates
     duplicates = deduplicator(dataset)
-    deduped = deduplicator.remove(dataset, duplicates)
-    return DocumentDataset(deduped)
+    return deduplicator.remove(dataset, duplicates)
 
 
 def fuzzy_dedupe(dataset: DocumentDataset, cache: str) -> DocumentDataset:

tutorials/tinystories/main.py

Lines changed: 3 additions & 3 deletions
@@ -152,8 +152,7 @@ def dedupe(dataset: DocumentDataset) -> DocumentDataset:
     deduplicator = ExactDuplicates(id_field="id", text_field="text", hash_method="md5")
     # Find the duplicates
     duplicates = deduplicator(dataset)
-    deduped = deduplicator.remove(dataset, duplicates)
-    return DocumentDataset(deduped)
+    return deduplicator.remove(dataset, duplicates)
 
 
 def run_curation_pipeline(args: argparse.Namespace, jsonl_dir: str) -> None:
@@ -173,7 +172,8 @@ def run_curation_pipeline(args: argparse.Namespace, jsonl_dir: str) -> None:
         keep_extensions="jsonl",
     )
     print("Reading the data...")
-    orig_dataset = DocumentDataset.read_json(files, add_filename=True)
+    # We don't read with add_filename because it already exists in the jsonl files.
+    orig_dataset = DocumentDataset.read_json(files)
     dataset = orig_dataset
 
     curation_steps = Sequential(
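On the add_filename change: that flag asks read_json to attach each record's source file name as a column, and the new inline comment states the TinyStories JSONL records already carry that field, so the flag is dropped. A sketch of the read step under that assumption; the import path for get_all_files_paths_under (nemo_curator.utils.file_utils) is my assumption, though keep_extensions="jsonl" appears verbatim in the hunk above:

# Assumes each JSONL record already includes the file-name field,
# as the tutorial's new inline comment states.
from nemo_curator.datasets import DocumentDataset
from nemo_curator.utils.file_utils import get_all_files_paths_under  # assumed import path

files = get_all_files_paths_under(jsonl_dir, keep_extensions="jsonl")
orig_dataset = DocumentDataset.read_json(files)  # no add_filename needed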
