
Commit 43df3a4

qgallouedec and kashif authored
🧳 Move zen generation script and fix tests (huggingface#2393)
* Move zen
* step -> stepwise_supervision
* Fix train_test_split shuffle issue
* Fix tests
* Update tests/test_sft_trainer.py
* Fix typo in key name

Co-authored-by: Kashif Rasul <[email protected]>
1 parent baee06f · commit 43df3a4

File tree

4 files changed: +39 -35 lines changed


examples/datasets/zen.py renamed to scripts/generate_zen_dataset.py

Lines changed: 18 additions & 18 deletions
@@ -28,13 +28,13 @@ class ScriptArguments:
             Fraction of the dataset to include in the test split.
         push_to_hub (`bool`, *optional*, defaults to `False`):
             Whether to push the dataset to the Hugging Face Hub.
-        repo_id (`str`, *optional*, defaults to `"trl-lib/zen"`):
+        repo_id (`str`, *optional*, defaults to `"trl-internal-testing/zen"`):
             Hugging Face repository ID to push the dataset to.
     """

     test_size: float = 0.1
     push_to_hub: bool = False
-    repo_id: str = "trl-lib/zen"
+    repo_id: str = "trl-internal-testing/zen"


 def main(test_size, push_to_hub, repo_id):
@@ -62,7 +62,7 @@ def main(test_size, push_to_hub, repo_id):
             "Namespaces are one honking great idea -- let's do more of those!",
         ],
     })
-    standard_language_modeling_dataset = standard_language_modeling_dataset.train_test_split(test_size=test_size)
+    standard_language_modeling_dataset = standard_language_modeling_dataset.train_test_split(test_size=test_size, shuffle=False)
     if push_to_hub:
         standard_language_modeling_dataset.push_to_hub(repo_id, config_name="standard_language_modeling")

@@ -89,7 +89,7 @@ def main(test_size, push_to_hub, repo_id):
             "Namespaces are one honking great",
         ],
     })
-    standard_prompt_only_dataset = standard_prompt_only_dataset.train_test_split(test_size=test_size)
+    standard_prompt_only_dataset = standard_prompt_only_dataset.train_test_split(test_size=test_size, shuffle=False)
     if push_to_hub:
         standard_prompt_only_dataset.push_to_hub(repo_id, config_name="standard_prompt_only")

@@ -137,7 +137,7 @@ def main(test_size, push_to_hub, repo_id):
             " idea -- let's do more of those!",
         ],
     })
-    standard_prompt_completion_dataset = standard_prompt_completion_dataset.train_test_split(test_size=test_size)
+    standard_prompt_completion_dataset = standard_prompt_completion_dataset.train_test_split(test_size=test_size, shuffle=False)
     if push_to_hub:
         standard_prompt_completion_dataset.push_to_hub(repo_id, config_name="standard_prompt_completion")

@@ -206,7 +206,7 @@ def main(test_size, push_to_hub, repo_id):
             " watermelon -- let's plant some!",
         ],
     })
-    standard_preference_dataset = standard_preference_dataset.train_test_split(test_size=test_size)
+    standard_preference_dataset = standard_preference_dataset.train_test_split(test_size=test_size, shuffle=False)
     if push_to_hub:
         standard_preference_dataset.push_to_hub(repo_id, config_name="standard_preference")

@@ -254,7 +254,7 @@ def main(test_size, push_to_hub, repo_id):
             "Namespaces are one honking great watermelon -- let's plant some!",
         ],
     })
-    standard_implicit_prompt_preference_dataset = standard_implicit_prompt_preference_dataset.train_test_split(test_size=test_size)
+    standard_implicit_prompt_preference_dataset = standard_implicit_prompt_preference_dataset.train_test_split(test_size=test_size, shuffle=False)
     if push_to_hub:
         standard_implicit_prompt_preference_dataset.push_to_hub(repo_id, config_name="standard_implicit_prompt_preference")

@@ -303,11 +303,11 @@ def main(test_size, push_to_hub, repo_id):
         ],
         "label": [True, False, False, True, True, False, True, False, True, True, False, True, True, False, True, False, True, False, False],
     })
-    standard_unpaired_preference_dataset = standard_unpaired_preference_dataset.train_test_split(test_size=test_size)
+    standard_unpaired_preference_dataset = standard_unpaired_preference_dataset.train_test_split(test_size=test_size, shuffle=False)
     if push_to_hub:
         standard_unpaired_preference_dataset.push_to_hub(repo_id, config_name="standard_unpaired_preference")

-    standard_step_dataset = Dataset.from_dict({
+    standard_stepwise_supervision_dataset = Dataset.from_dict({
         "prompt": [
             "Beautiful is better than",
             "Explicit is better than",
@@ -350,7 +350,7 @@ def main(test_size, push_to_hub, repo_id):
             [" of those great ideas,", " that solve many problems."],
             [" the code should still aim for balance."],
         ],
-        "label": [
+        "labels": [
             [False, True],
             [False, True, False],
             [False, True],
@@ -371,9 +371,9 @@ def main(test_size, push_to_hub, repo_id):
             [False]
         ]
     })
-    standard_step_dataset = standard_step_dataset.train_test_split(test_size=test_size)
+    standard_stepwise_supervision_dataset = standard_stepwise_supervision_dataset.train_test_split(test_size=test_size, shuffle=False)
     if push_to_hub:
-        standard_step_dataset.push_to_hub(repo_id, config_name="standard_step")
+        standard_stepwise_supervision_dataset.push_to_hub(repo_id, config_name="standard_stepwise_supervision")

     conversational_language_modeling_dataset = Dataset.from_dict({
         "messages": [
@@ -398,7 +398,7 @@ def main(test_size, push_to_hub, repo_id):
             [{"role": "user", "content": "Any great ideas?"}, {"role": "assistant", "content": "Namespaces are one honking great idea."}],
         ],
     })
-    conversational_language_modeling_dataset = conversational_language_modeling_dataset.train_test_split(test_size=test_size)
+    conversational_language_modeling_dataset = conversational_language_modeling_dataset.train_test_split(test_size=test_size, shuffle=False)
     if push_to_hub:
         conversational_language_modeling_dataset.push_to_hub(repo_id, config_name="conversational_language_modeling")

@@ -425,7 +425,7 @@ def main(test_size, push_to_hub, repo_id):
             [{"role": "user", "content": "Any great ideas?"}],
         ],
     })
-    conversational_prompt_only_dataset = conversational_prompt_only_dataset.train_test_split(test_size=test_size)
+    conversational_prompt_only_dataset = conversational_prompt_only_dataset.train_test_split(test_size=test_size, shuffle=False)
     if push_to_hub:
         conversational_prompt_only_dataset.push_to_hub(repo_id, config_name="conversational_prompt_only")

@@ -473,7 +473,7 @@ def main(test_size, push_to_hub, repo_id):
             [{"role": "assistant", "content": "Namespaces are one honking great idea."}],
         ],
     })
-    conversational_prompt_completion_dataset = conversational_prompt_completion_dataset.train_test_split(test_size=test_size)
+    conversational_prompt_completion_dataset = conversational_prompt_completion_dataset.train_test_split(test_size=test_size, shuffle=False)
     if push_to_hub:
         conversational_prompt_completion_dataset.push_to_hub(repo_id, config_name="conversational_prompt_completion")

@@ -542,7 +542,7 @@ def main(test_size, push_to_hub, repo_id):
             [{"role": "assistant", "content": "Recursion."}],
         ],
     })
-    conversational_preference_dataset = conversational_preference_dataset.train_test_split(test_size=test_size)
+    conversational_preference_dataset = conversational_preference_dataset.train_test_split(test_size=test_size, shuffle=False)
     if push_to_hub:
         conversational_preference_dataset.push_to_hub(repo_id, config_name="conversational_preference")

@@ -590,7 +590,7 @@ def main(test_size, push_to_hub, repo_id):
             [{"role": "user", "content": "Any great ideas?"}, {"role": "assistant", "content": "Recursion."}],
         ],
     })
-    conversational_implicit_prompt_preference_dataset = conversational_implicit_prompt_preference_dataset.train_test_split(test_size=test_size)
+    conversational_implicit_prompt_preference_dataset = conversational_implicit_prompt_preference_dataset.train_test_split(test_size=test_size, shuffle=False)
     if push_to_hub:
         conversational_implicit_prompt_preference_dataset.push_to_hub(repo_id, config_name="conversational_implicit_prompt_preference")

@@ -639,7 +639,7 @@ def main(test_size, push_to_hub, repo_id):
         ],
         "label": [True, True, True, False, True, True, True, False, True, False, True, False, True, False, False, True, True, True, True],
     })
-    conversational_unpaired_preference_dataset = conversational_unpaired_preference_dataset.train_test_split(test_size=test_size)
+    conversational_unpaired_preference_dataset = conversational_unpaired_preference_dataset.train_test_split(test_size=test_size, shuffle=False)
     if push_to_hub:
         conversational_unpaired_preference_dataset.push_to_hub(repo_id, config_name="conversational_unpaired_preference")
     # fmt: on
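Note: the repeated change in this file is the added `shuffle=False` argument to `datasets.Dataset.train_test_split`, which makes every zen split deterministic so the trainer tests can pin exact rows and sequence counts. A minimal sketch of that behavior on toy data (illustrative only, not part of the commit):

from datasets import Dataset

# Toy data, illustrative only: with shuffle=False the rows keep their original
# order, so the train split is the leading rows and the test split is the tail.
toy = Dataset.from_dict({"text": [f"aphorism {i}" for i in range(20)]})
split = toy.train_test_split(test_size=0.1, shuffle=False)
print(split["train"]["text"][:2])  # ['aphorism 0', 'aphorism 1']
print(split["test"]["text"])       # ['aphorism 18', 'aphorism 19']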

tests/test_bco_trainer.py

Lines changed: 10 additions & 8 deletions
@@ -160,10 +160,10 @@ def test_tokenize_and_process_tokens(self):
             self.assertListEqual(tokenized_dataset["prompt"], train_dataset["prompt"])
             self.assertListEqual(tokenized_dataset["completion"], train_dataset["completion"])
             self.assertListEqual(tokenized_dataset["label"], train_dataset["label"])
-            self.assertListEqual(tokenized_dataset["prompt_input_ids"][0], [31137])
-            self.assertListEqual(tokenized_dataset["prompt_attention_mask"][0], [1])
-            self.assertListEqual(tokenized_dataset["answer_input_ids"][0], [374, 2664, 1091, 16965, 13])
-            self.assertListEqual(tokenized_dataset["answer_attention_mask"][0], [1, 1, 1, 1, 1])
+            self.assertListEqual(tokenized_dataset["prompt_input_ids"][0], [46518, 374, 2664, 1091])
+            self.assertListEqual(tokenized_dataset["prompt_attention_mask"][0], [1, 1, 1, 1])
+            self.assertListEqual(tokenized_dataset["answer_input_ids"][0], [27261, 13])
+            self.assertListEqual(tokenized_dataset["answer_attention_mask"][0], [1, 1])

             fn_kwargs = {
                 "prefix": "",
@@ -178,13 +178,15 @@ def test_tokenize_and_process_tokens(self):
             self.assertListEqual(processed_dataset["prompt"], train_dataset["prompt"])
             self.assertListEqual(processed_dataset["completion"], train_dataset["completion"])
             self.assertListEqual(processed_dataset["label"], train_dataset["label"])
-            self.assertListEqual(processed_dataset["prompt_input_ids"][0], [31137])
-            self.assertListEqual(processed_dataset["prompt_attention_mask"][0], [1])
+            self.assertListEqual(processed_dataset["prompt_input_ids"][0], [46518, 374, 2664, 1091])
+            self.assertListEqual(processed_dataset["prompt_attention_mask"][0], [1, 1, 1, 1])
             self.assertListEqual(
-                processed_dataset["completion_input_ids"][0], [31137, 374, 2664, 1091, 16965, 13, 151645]
+                processed_dataset["completion_input_ids"][0], [46518, 374, 2664, 1091, 27261, 13, 151645]
             )
             self.assertListEqual(processed_dataset["completion_attention_mask"][0], [1, 1, 1, 1, 1, 1, 1])
-            self.assertListEqual(processed_dataset["completion_labels"][0], [-100, 374, 2664, 1091, 16965, 13, 151645])
+            self.assertListEqual(
+                processed_dataset["completion_labels"][0], [-100, -100, -100, -100, 27261, 13, 151645]
+            )

     @require_sklearn
     def test_bco_trainer_without_providing_ref_model(self):
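The updated expectations follow the usual labels convention: in `completion_labels`, the prompt tokens are masked with -100 (the ignore index) so only the answer and end-of-turn tokens contribute to the loss. A small self-contained check built from the ids asserted in this diff; the masking rule is the only convention being illustrated:

# Built from the token ids asserted above; not part of the test suite itself.
prompt_input_ids = [46518, 374, 2664, 1091]  # asserted prompt ids
answer_input_ids = [27261, 13]               # asserted answer ids
eot_token_id = 151645                        # end-of-turn id used in the fixtures

completion_input_ids = prompt_input_ids + answer_input_ids + [eot_token_id]
completion_labels = [-100] * len(prompt_input_ids) + answer_input_ids + [eot_token_id]

assert completion_input_ids == [46518, 374, 2664, 1091, 27261, 13, 151645]
assert completion_labels == [-100, -100, -100, -100, 27261, 13, 151645]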

tests/test_kto_trainer.py

Lines changed: 10 additions & 8 deletions
@@ -156,10 +156,10 @@ def test_tokenize_and_process_tokens(self):
             self.assertListEqual(tokenized_dataset["prompt"], train_dataset["prompt"])
             self.assertListEqual(tokenized_dataset["completion"], train_dataset["completion"])
             self.assertListEqual(tokenized_dataset["label"], train_dataset["label"])
-            self.assertListEqual(tokenized_dataset["prompt_input_ids"][0], [31137])
-            self.assertListEqual(tokenized_dataset["prompt_attention_mask"][0], [1])
-            self.assertListEqual(tokenized_dataset["answer_input_ids"][0], [374, 2664, 1091, 16965, 13])
-            self.assertListEqual(tokenized_dataset["answer_attention_mask"][0], [1, 1, 1, 1, 1])
+            self.assertListEqual(tokenized_dataset["prompt_input_ids"][0], [46518, 374, 2664, 1091])
+            self.assertListEqual(tokenized_dataset["prompt_attention_mask"][0], [1, 1, 1, 1])
+            self.assertListEqual(tokenized_dataset["answer_input_ids"][0], [27261, 13])
+            self.assertListEqual(tokenized_dataset["answer_attention_mask"][0], [1, 1])

             # Test corruption of (prompt, completion) pairs for KL dataset
             for batch_size in [2, 3]:
@@ -196,13 +196,15 @@ def test_tokenize_and_process_tokens(self):
             self.assertListEqual(processed_dataset["prompt"], train_dataset["prompt"])
             self.assertListEqual(processed_dataset["completion"], train_dataset["completion"])
             self.assertListEqual(processed_dataset["label"], train_dataset["label"])
-            self.assertListEqual(processed_dataset["prompt_input_ids"][0], [31137])
-            self.assertListEqual(processed_dataset["prompt_attention_mask"][0], [1])
+            self.assertListEqual(processed_dataset["prompt_input_ids"][0], [46518, 374, 2664, 1091])
+            self.assertListEqual(processed_dataset["prompt_attention_mask"][0], [1, 1, 1, 1])
             self.assertListEqual(
-                processed_dataset["completion_input_ids"][0], [31137, 374, 2664, 1091, 16965, 13, 151645]
+                processed_dataset["completion_input_ids"][0], [46518, 374, 2664, 1091, 27261, 13, 151645]
             )
             self.assertListEqual(processed_dataset["completion_attention_mask"][0], [1, 1, 1, 1, 1, 1, 1])
-            self.assertListEqual(processed_dataset["completion_labels"][0], [-100, 374, 2664, 1091, 16965, 13, 151645])
+            self.assertListEqual(
+                processed_dataset["completion_labels"][0], [-100, -100, -100, -100, 27261, 13, 151645]
+            )

     def test_kto_trainer_without_providing_ref_model(self):
         with tempfile.TemporaryDirectory() as tmp_dir:

tests/test_sft_trainer.py

Lines changed: 1 addition & 1 deletion
@@ -1172,7 +1172,7 @@ def test_sft_trainer_eval_packing(self):
             )

             self.assertEqual(len(trainer.train_dataset["input_ids"]), 46)  # w/ this dataset, we end up with 46 seqs
-            self.assertEqual(len(trainer.eval_dataset["input_ids"]), 5)  # w/ this dataset, we end up with 5 seqs
+            self.assertEqual(len(trainer.eval_dataset["input_ids"]), 6)  # w/ this dataset, we end up with 6 seqs

     def test_sft_trainer_no_packing(self):
         with tempfile.TemporaryDirectory() as tmp_dir:
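Taken together, the script changes mean the generated datasets default to the `trl-internal-testing/zen` repo id and the stepwise config is published as `standard_stepwise_supervision` with a `labels` column. A hedged loading sketch; it assumes the datasets have actually been pushed with `push_to_hub=True`, which the diff itself does not guarantee:

from datasets import load_dataset

# Repo id and config name come from the diff above; availability on the Hub is
# an assumption.
zen = load_dataset("trl-internal-testing/zen", "standard_stepwise_supervision")
print(zen)                        # DatasetDict with deterministic train/test splits
print(zen["train"]["labels"][0])  # per-step booleans, e.g. [False, True]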
