Skip to content

Commit a6764d8

Browse files
Semantic Dedup Tutorial + bug fixes (#1067)
* bug fixes Signed-off-by: Praateek <praateekm@gmail.com> * add notebooks Signed-off-by: Praateek <praateekm@gmail.com> * change input path Signed-off-by: Praateek <praateekm@gmail.com> * add comment about input filetype Signed-off-by: Praateek <praateekm@gmail.com> * add download dataset too Signed-off-by: Praateek <praateekm@gmail.com> * pr comments Signed-off-by: Praateek <praateekm@gmail.com> * json -> jsonl Signed-off-by: Praateek <praateekm@gmail.com> * fc Signed-off-by: Praateek <praateekm@gmail.com> * pr comments Signed-off-by: Praateek <praateekm@gmail.com> * .. Signed-off-by: Praateek <praateekm@gmail.com> * change graph Signed-off-by: Praateek <praateekm@gmail.com> * pr reveiw Signed-off-by: Praateek <praateekm@gmail.com> * .. Signed-off-by: Praateek <praateekm@gmail.com> --------- Signed-off-by: Praateek <praateekm@gmail.com>
1 parent ea599ae commit a6764d8

File tree

4 files changed

+2177
-3
lines changed

4 files changed

+2177
-3
lines changed

nemo_curator/stages/deduplication/semantic/workflow.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -380,7 +380,7 @@ def run(self, pairwise_executor: BaseExecutor | None = None) -> dict[str, Any]:
380380
if total_duplicates > 0:
381381
logger.success(f"Total documents identified as duplicates: {total_duplicates}")
382382
logger.info(f"Similarity threshold used: {1.0 - self.eps:.3f} (eps={self.eps})")
383-
else:
383+
elif self.eps is not None:
384384
logger.info(
385385
f"No duplicates identified with similarity threshold of {1.0 - self.eps:.3f} (eps={self.eps})"
386386
)
@@ -396,5 +396,5 @@ def run(self, pairwise_executor: BaseExecutor | None = None) -> dict[str, Any]:
396396
"pairwise_execution_time": pairwise_time,
397397
"kmeans_results": kmeans_results,
398398
"pairwise_results": pairwise_results,
399-
"total_duplicates_identified": total_duplicates,
399+
**({"total_duplicates_identified": total_duplicates} if self.eps is not None else {}),
400400
}

nemo_curator/stages/text/io/reader/base.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,12 @@ def inputs(self) -> tuple[list[str], list[str]]:
5252
return [], []
5353

5454
def outputs(self) -> tuple[list[str], list[str]]:
    """Return ``(["data"], columns)`` describing what this reader emits.

    ``columns`` starts from ``self.fields`` (or an empty list when it is
    unset) and additionally includes ``CURATOR_DEDUP_ID_STR`` whenever
    ``self._generate_ids`` or ``self._assign_ids`` is truthy.

    Returns:
        A two-tuple whose first element is ``["data"]`` and whose second
        element is a freshly built list of column names. The list is never
        the same object as ``self.fields``, so callers may mutate it freely.
    """
    # Copy before appending: ``self.fields or []`` would hand back the very
    # list stored on the instance, and appending to it would permanently add
    # the ID column to ``self.fields`` (once more on every call).
    output_fields = list(self.fields or [])
    if self._generate_ids or self._assign_ids:
        # Function-local import, as in the original change.
        # NOTE(review): presumably kept local to avoid an import cycle or
        # eager import cost — confirm before hoisting to module level.
        from nemo_curator.stages.deduplication.id_generator import CURATOR_DEDUP_ID_STR

        output_fields.append(CURATOR_DEDUP_ID_STR)
    return ["data"], output_fields
5661

5762
def setup(self, _: WorkerMetadata | None = None) -> None:
5863
if self._generate_ids or self._assign_ids:

0 commit comments

Comments
 (0)