
Commit 33db56e

ayushdg and vinay-raman authored

fixed github issue 677 (#682) (#687)

* fixed github issue 677
* added docs for hard-negative-mining
* minor changes
* fixed multi-gpu error

Signed-off-by: viraman <viraman@nvidia.com>
Signed-off-by: Ayush Dattagupta <ayushdg95@gmail.com>
Co-authored-by: vinay-raman <98057837+vinay-raman@users.noreply.github.com>

1 parent 1acb412 commit 33db56e

File tree

3 files changed: +26 −4 lines changed

tutorials/nemo-retriever-synthetic-data-generation/README.md

Lines changed: 18 additions & 0 deletions

```diff
@@ -109,3 +109,21 @@
 For Answerability Filter, our recommendation is to go with the choice provided in the default configuration file. We confirmed that the checkbox-style prompt in the default configuration worked well for valid question filtering.
 
 However, the framework is flexible in the choice of LLM-as-a-Judge, and different LLMs with different prompt templates might work better for certain use cases. You can also experiment with Likert-scale prompting if need be.
+
+## Hard Negative Mining
+
+Hard-negative mining involves two steps. The first step repartitions the dataset into clusters of semantically similar documents, using the following script:
+
+```
+python tutorials/nemo-retriever-synthetic-data-generation/repartition.py \
+  --api-key=<API Key> \
+  --input-dir=tutorials/nemo-retriever-synthetic-data-generation/sample_data/hard-neg-mining \
+  --hard-negative-mining-config=tutorials/nemo-retriever-synthetic-data-generation/config/hard-negative-mining-config.yaml \
+  --output-dir=tutorials/nemo-retriever-synthetic-data-generation/my_clustered_dataset_dir
+```
+
+Once the semantic clusters have been created, you can perform the hard-negative mining as follows:
+
+```
+python tutorials/nemo-retriever-synthetic-data-generation/mine_hard_negatives.py \
+  --api-key=<API Key> \
+  --input-dir=tutorials/nemo-retriever-synthetic-data-generation/my_clustered_dataset_dir \
+  --hard-negative-mining-config=tutorials/nemo-retriever-synthetic-data-generation/config/hard-negative-mining-config.yaml \
+  --output-dir=tutorials/nemo-retriever-synthetic-data-generation/my_mined_dataset_dir
+```
```
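For intuition about the mining step itself: a hard negative is a document that is highly similar to the query yet is not the labeled positive, which makes it a maximally informative training example for a retriever. A minimal, dependency-free sketch of that ranking idea (hypothetical helper names; not the tutorial's actual implementation, which embeds text with a model before scoring):

```python
import math

def cosine(u, v):
    # Cosine similarity between two equal-length vectors.
    dot = sum(a * b for a, b in zip(u, v))
    norm_u = math.sqrt(sum(a * a for a in u))
    norm_v = math.sqrt(sum(b * b for b in v))
    return dot / (norm_u * norm_v)

def mine_hard_negatives(query_vec, doc_vecs, positive_id, k=2):
    # Rank every non-positive document by similarity to the query; the most
    # similar irrelevant documents are the "hardest" negatives.
    scored = [
        (doc_id, cosine(query_vec, vec))
        for doc_id, vec in doc_vecs.items()
        if doc_id != positive_id
    ]
    scored.sort(key=lambda item: item[1], reverse=True)
    return [doc_id for doc_id, _ in scored[:k]]
```

Repartitioning into semantic clusters first (the tutorial's step one) keeps this search local: negatives are mined from documents already known to be near the query's topic.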

tutorials/nemo-retriever-synthetic-data-generation/mine_hard_negatives.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -36,7 +36,7 @@ def main():
         "--input-dir",
         type=str,
         default="",
-        help="Input dir path containing annotated data files in jsonl format",
+        help="Input dir path containing annotated data files in jsonl format (with extension .part)",
     )
     parser.add_argument(
         "--hard-negative-mining-config",
```
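The diff only touches one help string, but it implies the script's CLI surface. A hedged sketch of that argparse setup, with flag names taken from the diff and the README commands; the defaults and description here are illustrative assumptions, not the repository's exact code:

```python
import argparse

def build_parser() -> argparse.ArgumentParser:
    # Flags mirror the tutorial's mine_hard_negatives.py invocation.
    parser = argparse.ArgumentParser(description="Mine hard negatives")
    parser.add_argument("--api-key", type=str, default="")
    parser.add_argument(
        "--input-dir",
        type=str,
        default="",
        help="Input dir path containing annotated data files in jsonl format (with extension .part)",
    )
    parser.add_argument("--hard-negative-mining-config", type=str, default="")
    parser.add_argument("--output-dir", type=str, default="")
    return parser
```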

tutorials/nemo-retriever-synthetic-data-generation/retriever_hardnegative_miner.py

Lines changed: 7 additions & 3 deletions

```diff
@@ -14,12 +14,12 @@
 
 import importlib
 import itertools
+from typing import TYPE_CHECKING
 
 import numpy as np
 import pandas as pd
 from dask.base import normalize_token
 from openai import OpenAI
-from sentence_transformers import SentenceTransformer
 
 from nemo_curator import ClusteringModel
 from nemo_curator.datasets import DocumentDataset
@@ -30,13 +30,18 @@
 )
 RetrieverHardNegativeMiningConfig = config.RetrieverHardNegativeMiningConfig
 
+if TYPE_CHECKING:
+    from sentence_transformers import SentenceTransformer
+
 
 def create_nim_client(base_url, api_key):
     openai_client = OpenAI(base_url=base_url, api_key=api_key)
     return openai_client
 
 
-def create_hf_model(model_name_or_path):
+def create_hf_model(model_name_or_path: str) -> "SentenceTransformer":
+    from sentence_transformers import SentenceTransformer
+
     return SentenceTransformer(model_name_or_path, trust_remote_code=True)
```

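The change above is the standard lazy-import pattern: the heavy `sentence_transformers` import moves out of module scope, into a `TYPE_CHECKING` block for the type annotation and into the function body for the runtime use. The same pattern in general form, with `decimal.Decimal` standing in for the heavy dependency purely for illustration:

```python
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Evaluated only by static type checkers, never at runtime, so a heavy
    # dependency named here adds nothing to module import time.
    from decimal import Decimal  # stand-in for e.g. sentence_transformers

def make_value(text: str) -> "Decimal":
    # The runtime import is deferred to the first call of this function,
    # so merely importing this module stays cheap.
    from decimal import Decimal
    return Decimal(text)
```

The string annotation `"Decimal"` (like `"SentenceTransformer"` in the diff) is a forward reference, so the name need not exist at runtime.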
```diff
@@ -167,7 +172,6 @@ def _groupby_question(self, pdf):
     def __call__(self, dataset: DocumentDataset) -> DocumentDataset:
 
         df = dataset.df
-        df = df.to_backend("pandas")
         df = df[["question", "documents"]]
         df = df.map_partitions(self._groupby_question).reset_index()
         print("Number partitions in dataset = {}".format(df.npartitions))
```
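Removing `df.to_backend("pandas")` lets `map_partitions` run `_groupby_question` on whatever backend each partition already uses, which is what the multi-GPU fix in the commit message refers to. The body of `_groupby_question` is not shown in this diff; a hedged, pandas-only sketch of what such a per-partition group-by over `question` and `documents` might look like (an assumption for illustration, not the repository's actual implementation):

```python
import pandas as pd

def groupby_question(pdf: pd.DataFrame) -> pd.DataFrame:
    # Collapse one row per (question, document) pair into one row per
    # question, gathering that question's documents into a list -- the
    # per-partition shape a map_partitions call could then consume.
    return pdf.groupby("question").agg({"documents": list})
```

Because the dataset was repartitioned by semantic cluster in step one, each partition holds related rows, so a purely partition-local group-by like this can be meaningful without a global shuffle.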
