Commit a46fb87

Fix bugs in retriever sdg notebook (#522)
* fixed qa bug 5008113
* bug fixes for generator
* fixed precommit
* fixed filters
* fixed all issues
* fixed bug with document id
* check if filtering pipeline is present
* fixed notebook
* added functionality to filter pre-generated datasets
* separated generation & filtering pipelines
* fixed pre-commit
* minor changes
* addressed Ryan Wolf's comments
* fixed minor bugs in configs
* removed commented code in main.py
* added CLI flags for generation & filtering; removed code duplication
* minor fix to quickstart notebook
* removed filter.py & generate.py

---------

Signed-off-by: viraman <viraman@nvidia.com>
Signed-off-by: Vinay Raman <viraman@nvidia.com>
1 parent d8f99f9 commit a46fb87

File tree

9 files changed: +332, −180 lines

nemo_curator/filters/synthetic.py

Lines changed: 32 additions & 19 deletions
```diff
@@ -24,6 +24,15 @@
 
 from nemo_curator.filters.doc_filter import DocumentFilter
 from nemo_curator.utils.decorators import batched
+from nemo_curator.utils.distributed_utils import NoWorkerError, load_object_on_worker
+
+
+def create_client(base_url, api_key):
+    openai_client = OpenAI(
+        base_url=base_url,
+        api_key=api_key,
+    )
+    return openai_client
 
 
 # ----------------------------------------------------------------------------80
@@ -52,16 +61,21 @@ def __init__(
         self.percentile = percentile
         if truncate:
             self.truncate = truncate
-        try:
-            self.client = OpenAI(base_url=self.base_url, api_key=self.api_key)
-        except Exception as e:
-            print(f"Error accessing NIM model: {e}")
         self.batch_size = batch_size
         self.text_fields = text_fields
 
     @batched
     def score_document(self, df: pd.DataFrame):
 
+        try:
+            self.client = load_object_on_worker(
+                attr="openai_client_easiness",
+                load_object_function=create_client,
+                load_object_kwargs={"base_url": self.base_url, "api_key": self.api_key},
+            )
+        except NoWorkerError:
+            return pd.Series(np.ones(len(df)), dtype=float)
+
         document_score = self._calc_similarity_nim(
             df[self.text_fields[0]].to_list(), df[self.text_fields[1]].to_list()
         )
@@ -90,7 +104,7 @@ def _get_nim_embedding(self, text, input_type):
             print(f"Error: {e}")
             response = None
 
-        if response:
+        if response and not isinstance(response, str):
             if isinstance(text, list):
                 embeddings = [r.embedding for r in response.data]
             elif isinstance(text, str):
@@ -116,9 +130,6 @@ def _calc_similarity_nim(self, context, question):
 
         return sim
 
-    def __dask_tokenize__(self):
-        return normalize_token(EasinessFilter)
-
 
 # ----------------------------------------------------------------------------80
 # ----------------------- Answerability Filter ---------------------------------
@@ -149,19 +160,24 @@ def __init__(
         self.system_prompt = answerability_system_prompt
         self.user_prompt_template = answerability_user_prompt_template
         self.num_criteria = num_criteria
-
-        try:
-            self.client = OpenAI(base_url=self.base_url, api_key=self.api_key)
-        except Exception as e:
-            print(f"Error accessing NIM model: {e}")
-
         self.text_fields = text_fields
 
     @batched
     def score_document(self, df: pd.DataFrame):
-        return df.apply(
+
+        try:
+            self.client = load_object_on_worker(
+                attr="openai_client_answerability",
+                load_object_function=create_client,
+                load_object_kwargs={"base_url": self.base_url, "api_key": self.api_key},
+            )
+        except NoWorkerError:
+            return pd.Series(["string"] * len(df))
+
+        return df.progress_apply(
             lambda row: self._llm_as_judge(
-                row[self.text_fields[0]], row[self.text_fields[1]]
+                row[self.text_fields[0]],
+                row[self.text_fields[1]],
            ),
             axis=1,
         )
@@ -212,8 +228,5 @@ def _llm_as_judge(self, context: str, question: str):
 
         return generation
 
-    def __dask_tokenize__(self):
-        return normalize_token(AnswerabilityFilter)
-
 
 # ----------------------------------------------------------------------------80
```
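For context on the pattern these hunks adopt: `load_object_on_worker` builds an object on first use and caches it as an attribute of the current Dask worker, so each worker creates its OpenAI client once instead of the client being pickled along with the filter (presumably also why the `__dask_tokenize__` overrides became unnecessary). Below is a condensed sketch of that pattern, not the repository's exact code; it assumes only the imports and keyword signatures visible in the diff above.

```python
# Condensed sketch of the per-worker client pattern used by the filters.
import numpy as np
import pandas as pd
from openai import OpenAI

from nemo_curator.utils.distributed_utils import NoWorkerError, load_object_on_worker


def create_client(base_url, api_key):
    return OpenAI(base_url=base_url, api_key=api_key)


def score_batch(df: pd.DataFrame, base_url: str, api_key: str) -> pd.Series:
    try:
        # Created once per Dask worker and cached under `attr`;
        # subsequent batches on the same worker reuse the cached client.
        client = load_object_on_worker(
            attr="openai_client_easiness",
            load_object_function=create_client,
            load_object_kwargs={"base_url": base_url, "api_key": api_key},
        )
    except NoWorkerError:
        # No worker available, e.g. during Dask's metadata-inference pass:
        # return placeholder scores of the right length and dtype.
        return pd.Series(np.ones(len(df)), dtype=float)
    # ... use `client` to embed and score the batch, as EasinessFilter does ...
    return pd.Series(np.zeros(len(df)), dtype=float)  # placeholder result
```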

tutorials/nemo-retriever-synthetic-data-generation/README.md

Lines changed: 21 additions & 8 deletions
````diff
@@ -45,22 +45,35 @@ Navigate to the [quick start notebook](notebooks/quickstart.ipynb) and follow th
 
 ### Run Pipeline (CLI)
 
-The pipeline can be run with datasets in rawdoc (only text, title and ids if any) format. To test the pipeline, you can use the provided example data at ```sample_data_rawdoc.jsonl```
+The pipeline can be run with datasets in ```jsonl``` format (text, title, and ids if any). To test the pipeline, you can use the provided example data at ```sample_data/sample_data_rawdoc.jsonl```
 
-Navigate to the top level of this project directory and run the following command in your command line. It will take roughly 5-10 minutes.
+Provide your data in one or more `.jsonl` files. The structure of the data should follow this format: `{"text": <document>, "title": <title>}`. If the documents already have document ids, the input file can also contain them, and the same ids will be persisted in the generated data; that format is `{"_id": <document_id>, "text": <document>, "title": <title>}`.
 
-- `Rawdoc format`
-
-To use rawdoc format, provide your data in a `.jsonl` file. The structure of the data should follow this format: `{"text": <document>, "title": <title>}`. Additionally, if the documents already have a document id, the input file can also contain document ids. The same ids will be persisted in the generated data as well. Another accepted format is `{"_id": <document_id>, "text": <document>, "title": <title>}`.
+The pipeline can be run in two modes, generation and filtering. To run the full pipeline in generation mode, use the script ```main.py``` with the flag ```--pipeline-type=generate```:
+```
+python tutorials/nemo-retriever-synthetic-data-generation/main.py \
+  --api-key=<API Key> \
+  --input-dir=tutorials/nemo-retriever-synthetic-data-generation/sample_data \
+  --pipeline-config=tutorials/nemo-retriever-synthetic-data-generation/config/config.yaml \
+  --input-format=jsonl \
+  --pipeline-type=generate \
+  --output-dir=tutorials/nemo-retriever-synthetic-data-generation/outputs/sample_data_rawdoc \
+  --save-format=jsonl \
+  --n-partitions=5
+```
+The data can be saved in two formats, jsonl and beir. Additionally, the user can pass the ```--n-partitions``` flag to speed up generation for large datasets.
 
-In order to run the pipeline, use the script ```main.py```
+To filter pre-generated data, run ```main.py``` with ```--pipeline-type=filter```.
+Note the change in ```--input-dir```: it must point to the generated data in jsonl format.
 ```
 python tutorials/nemo-retriever-synthetic-data-generation/main.py \
   --api-key=<API Key> \
-  --input-file=tutorials/nemo-retriever-synthetic-data-generation/data/sample_data_rawdoc.jsonl \
+  --input-dir=tutorials/nemo-retriever-synthetic-data-generation/outputs/sample_data_rawdoc/jsonl \
   --pipeline-config=tutorials/nemo-retriever-synthetic-data-generation/config/config.yaml \
-  --input-format=rawdoc \
+  --input-format=jsonl \
+  --pipeline-type=filter \
   --output-dir=tutorials/nemo-retriever-synthetic-data-generation/outputs/sample_data_rawdoc \
+  --save-format=jsonl
 ```
 
 For more information about the expected structure of the data, see the [quick start notebook](notebooks/quickstart.ipynb).
````
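For illustration, a single input record in the accepted format could look like the line below. The field values are invented; only the field names come from the README, and `_id` is optional as noted above.

```
{"_id": "doc-0001", "title": "Sample Title", "text": "Sample document text to generate questions from."}
```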

tutorials/nemo-retriever-synthetic-data-generation/config/config-fiqa.yaml

Lines changed: 2 additions & 2 deletions
```diff
@@ -55,7 +55,7 @@ generator_system_prompt: |
   Do I need a new EIN since I am hiring employees for my LLC?
 
 user_prompt_template: |
-  Generate {num_questions} questions and corresponding answers based on Input Document.
+  Generate {n_openlines} questions and corresponding answers based on Input Document.
 
   Input Document:
   {document}
@@ -72,7 +72,7 @@ percentile: 70 # Percentile for threshold calculation (float) [0, 100]
 batch_size: 1
 
 # Answerability filter (LLM-as-judge)
-answerability_filter: "meta/llama3-70b-instruct"
+answerability_filter: "meta/llama-3.1-70b-instruct"
 num_criteria: 4 # Number of criteria to parse from the response. It must be aligned with the prompt template
 answerability_system_prompt: |
   You are an evaluator who is rating questions to given context passages based on the given criteria. Assess the given question for clarity and answerability given enough domain knowledge, consider the following evaluation criterion:
```
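The placeholder rename above matters because the template is filled in by keyword at generation time: a template still using `{num_questions}` fails with a `KeyError` when the pipeline substitutes `n_openlines`. A short illustration of that substitution follows; the call site is hypothetical, only the placeholder names come from the config.

```python
# Hypothetical illustration: the placeholder name must match the keyword
# supplied at format time. Template text abbreviated from the config above.
user_prompt_template = (
    "Generate {n_openlines} questions and corresponding answers "
    "based on Input Document.\n\nInput Document:\n{document}"
)

prompt = user_prompt_template.format(n_openlines=3, document="Example passage ...")
print(prompt)
# With the old "{num_questions}" placeholder, the same call raises
# KeyError: 'num_questions'.
```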

tutorials/nemo-retriever-synthetic-data-generation/config/config-nq.yaml

Lines changed: 1 addition & 1 deletion
```diff
@@ -72,7 +72,7 @@ percentile: 70 # Percentile for threshold calculation (float) [0, 100]
 batch_size: 1
 
 # Answerability filter (LLM-as-judge)
-answerability_filter: "meta/llama3-70b-instruct"
+answerability_filter: "meta/llama-3.1-70b-instruct"
 num_criteria: 4 # Number of criteria to parse from the response. It must be aligned with the prompt template
 answerability_system_prompt: |
   You are an evaluator who is rating questions to given context passages based on the given criteria. Assess the given question for clarity and answerability given enough domain knowledge, consider the following evaluation criterion:
```

tutorials/nemo-retriever-synthetic-data-generation/config/config.yaml

Lines changed: 1 addition & 1 deletion
```diff
@@ -63,7 +63,7 @@ percentile: 70 # Percentile for threshold calculation (float) [0, 100]
 batch_size: 1
 
 # Answerability filter (LLM-as-judge)
-answerability_filter: "meta/llama3-70b-instruct"
+answerability_filter: "meta/llama-3.1-70b-instruct"
 num_criteria: 4 # Number of criteria to parse from the response. It must be aligned with the prompt template
 answerability_system_prompt: |
   You are an evaluator who is rating questions to given context passages based on the given criteria. Assess the given question for clarity and answerability given enough domain knowledge, consider the following evaluation criterion:
```
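The renamed model id is what the answerability filter passes to its OpenAI-compatible endpoint. A hypothetical sketch of such an LLM-as-judge call is below; the base URL and message contents are assumptions for illustration, not taken from this diff, and the filter's actual request construction is not shown here.

```python
# Hypothetical LLM-as-judge call against an OpenAI-compatible NIM endpoint.
# The base_url shown is an assumption; substitute your own endpoint and key.
from openai import OpenAI

client = OpenAI(
    base_url="https://integrate.api.nvidia.com/v1",  # assumed endpoint
    api_key="<API Key>",
)
completion = client.chat.completions.create(
    model="meta/llama-3.1-70b-instruct",  # model id from the config above
    messages=[
        {"role": "system", "content": "You are an evaluator ..."},  # abbreviated
        {"role": "user", "content": "Context: ...\nQuestion: ..."},
    ],
)
print(completion.choices[0].message.content)
```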
