Commit 8b28d3a

Merge pull request #62 from bigcode-project/rename-path-arguments
Rename path arguments
2 parents: 705b007 + 4571452

5 files changed: +31 -23 lines changed

README.md

Lines changed: 2 additions & 2 deletions
@@ -89,7 +89,7 @@ accelerate launch main.py \
 * `limit` represents the number of problems to solve, if it's not provided all problems in the benchmark are selected.
 * `allow_code_execution` is for executing the generated code: it is off by default, read the displayed warning before calling it to enable execution.
 * Some models with custom code on the HF hub like [SantaCoder](https://huggingface.co/bigcode/santacoder) require calling `--trust_remote_code`, for private models add `--use_auth_token`.
-* `save_generations` saves the post-processed generations in a json file. You can also save references by calling `--save_references`
+* `save_generations` saves the post-processed generations in a json file at `save_generations_path` (by default `generations.json`). You can also save references by calling `--save_references`

 Some tasks don't require code execution such as
 `codexglue_code_to_text-<LANGUAGE>`/`codexglue_code_to_text-python-left`/`conala`/`concode` that use BLEU evaluation. In addition, we generate one candidate solution for each problem in these tasks, so use `n_samples=1` and `batch_size=1`. (Note that `batch_size` should always be equal or less than `n_samples`).
@@ -108,7 +108,7 @@ If you already have the generations in a json file from this evaluation harness
 Below is an example, be mind of specifying arguments proper to the task you are evaluating on, and note that `model` value here only serves for documenting the experiment.

 ```bash
-accelerate launch main.py --tasks mbpp --allow_code_execution --generations_path generations.json --model incoder-temperature-08
+accelerate launch main.py --tasks mbpp --allow_code_execution --load_generations_path generations.json --model incoder-temperature-08
 ```

 ## Implementing new tasks
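
Since `--save_generations_path` and `--load_generations_path` point at the same JSON file, a saved file can be inspected before being fed back for an evaluation-only run. A minimal sketch, assuming the file holds one list of candidate solutions per problem (the path and the printed summary are illustrative, not from the repository):

```python
import json

# Hypothetical check of a file written with --save_generations_path
# before reusing it via --load_generations_path.
with open("generations.json") as fp:
    generations = json.load(fp)

print(f"{len(generations)} problems loaded")
print(f"{len(generations[0])} candidate solutions for the first problem")
```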

lm_eval/evaluator.py

Lines changed: 6 additions & 5 deletions
@@ -32,7 +32,7 @@ def __init__(self, accelerator, model, tokenizer, args):
         self.args = args

         # setup arguments
-        self.output_path = args.output_path
+        self.metric_output_path = args.metric_output_path

         # code evaluation permission
         self.allow_code_execution = args.allow_code_execution
@@ -67,19 +67,20 @@ def evaluate(self, task_name):
         generations, references = self.generate_text(task_name)

         if self.accelerator.is_main_process:
-            if not self.args.generations_path:
+            if not self.args.load_generations_path:
                 if self.args.save_generations:
-                    with open("generations.json", "w") as fp:
+                    with open(self.args.save_generations_path, "w") as fp:
                         json.dump(generations, fp)
-                        print("generations were saved")
+                        print(f"generations were saved at {self.args.save_generations_path}")
                 if self.args.save_references:
                     with open("references.json", "w") as fp:
                         json.dump(references, fp)
-                        print("references were saved")
+                        print("references were saved at references.json")

             # make sure tokenizer plays nice with multiprocessing
             os.environ["TOKENIZERS_PARALLELISM"] = "false"
             if self.allow_code_execution and task.requires_execution:
                 os.environ["HF_ALLOW_CODE_EVAL"] = "1"
+            print("Evaluating generations...")
             results = task.process_results(generations, references)
             return results
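
The renamed attributes only change where the artifacts land on disk. A minimal sketch of the new save branch in isolation, using an `argparse.Namespace` as a stand-in for the parsed arguments (the field values below are illustrative, not from the repository):

```python
import json
from argparse import Namespace

# Stand-in for the parsed CLI arguments after the rename; values are examples.
args = Namespace(
    load_generations_path=None,            # nothing preloaded, so generation ran
    save_generations=True,
    save_generations_path="my_generations.json",
    metric_output_path="evaluation_results.json",
)

generations = [["def add(a, b):\n    return a + b"]]  # dummy generations

# Mirrors the updated branch: generations are only written to disk
# when they were not loaded from a file in the first place.
if not args.load_generations_path and args.save_generations:
    with open(args.save_generations_path, "w") as fp:
        json.dump(generations, fp)
        print(f"generations were saved at {args.save_generations_path}")
```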

lm_eval/generation.py

Lines changed: 2 additions & 2 deletions
@@ -36,9 +36,9 @@ def __call__(self, input_ids, scores, **kwargs):
 

 def parallel_generations(task, dataset, accelerator, model, tokenizer, n_tasks, args):
-    if args.generations_path:
+    if args.load_generations_path:
         # load generated code
-        with open(args.generations_path) as fp:
+        with open(args.load_generations_path) as fp:
             generations = json.load(fp)
             if accelerator.is_main_process:
                 print(
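
The load side is symmetric: when `load_generations_path` is set, `parallel_generations` reads the file and skips model generation. A small standalone sketch of that early-return branch; only the renamed argument comes from the diff, the helper name and the truncation to `n_tasks` are written out here for illustration:

```python
import json

def load_precomputed_generations(load_generations_path, n_tasks):
    """Sketch: read previously generated solutions and skip generation.
    Only the renamed argument comes from the diff; the rest is illustrative."""
    with open(load_generations_path) as fp:
        generations = json.load(fp)
    print(f"loaded generations for {min(n_tasks, len(generations))} of {len(generations)} problems")
    return generations[:n_tasks]

# Hypothetical usage with a file produced by --save_generations_path:
# generations = load_precomputed_generations("generations.json", n_tasks=10)
```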

main.py

Lines changed: 17 additions & 11 deletions
@@ -70,6 +70,12 @@ def parse_args():
         default=512,
         help="Maximum length of generated sequence (prompt+generation)",
     )
+    parser.add_argument(
+        "--precision",
+        type=str,
+        default="fp32",
+        help="Model precision, from: fp32, fp16 or bf16",
+    )
     parser.add_argument(
         "--limit",
         type=int,
@@ -92,19 +98,13 @@ def parse_args():
         help="Do code generation but no evaluation",
     )
     parser.add_argument(
-        "--precision",
-        type=str,
-        default="fp32",
-        help="Model precision, from: fp32, fp16 or bf16",
-    )
-    parser.add_argument(
-        "--generations_path",
+        "--load_generations_path",
         type=str,
         default=None,
         help="Path of file with previously generated solutions, if provided generation is skipped and only evaluation is done",
     )
     parser.add_argument(
-        "--output_path",
+        "--metric_output_path",
         type=str,
         default="evaluation_results.json",
         help="Path to save the results",
@@ -114,6 +114,12 @@ def parse_args():
         action="store_true",
         help="Whether to save code generations",
     )
+    parser.add_argument(
+        "--save_generations_path",
+        type=str,
+        default="generations.json",
+        help="Path for saving the code generations",
+    )
     parser.add_argument(
         "--save_references",
         action="store_true",
@@ -147,7 +153,7 @@ def main():
     print(f"Selected Tasks: {task_names}")

     results = {}
-    if args.generations_path:
+    if args.load_generations_path:
         # here we don't generate code but only evaluate previously computed generations
         if accelerator.is_main_process:
             print("evaluation only mode")
@@ -199,13 +205,13 @@ def main():
         else:
             results[task] = evaluator.evaluate(task)

-    results["config"] = {"model": args.model}
+    results["config"] = {"model": args.model, "temperature": args.temperature, "n_samples": args.n_samples}
     if not args.generation_only:
         dumped = json.dumps(results, indent=2)
         if accelerator.is_main_process:
             print(dumped)

-        with open(args.output_path, "w") as f:
+        with open(args.metric_output_path, "w") as f:
             f.write(dumped)
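
For driver scripts that call the harness, only the option names change; defaults and help text stay as shown in the diff. A minimal sketch of just the renamed and added options (the real `parse_args` defines many more, and the help strings are abridged here):

```python
import argparse

# Only the options touched by this commit; the real parser has many more.
parser = argparse.ArgumentParser()
parser.add_argument("--load_generations_path", type=str, default=None,
                    help="Previously generated solutions; skips generation if set")
parser.add_argument("--metric_output_path", type=str, default="evaluation_results.json",
                    help="Where the evaluation results are written")
parser.add_argument("--save_generations_path", type=str, default="generations.json",
                    help="Where post-processed generations are written")
parser.add_argument("--precision", type=str, default="fp32",
                    help="Model precision, from: fp32, fp16 or bf16")

# Example: evaluation-only invocation, mirroring the README command.
args = parser.parse_args(["--load_generations_path", "generations.json"])
print(args.load_generations_path, args.metric_output_path, args.save_generations_path)
```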

tests/test_generation_evaluation.py

Lines changed: 4 additions & 3 deletions
@@ -29,9 +29,10 @@ def update_args(args):
     # the executed code for the tests is safe (see tests/data/*_eval_gens.json)
     args.allow_code_execution = True
     args.save_generations = False
+    args.save_generations_path = ""
     args.save_references = False
-    args.output_path = TMPDIR
-    args.generations_path = None
+    args.metric_output_path = TMPDIR
+    args.load_generations_path = None
     args.generation_only = False
     # postprocessing for HumanEval and MBPP makes generations
     # with dummy model not distinctive
@@ -90,7 +91,7 @@ def test_evaluation():
     for task in EVAL_TASKS:
         print(f"testing task {task}")
         # path to generation examples to evaluate
-        args.generations_path = f"tests/data/{task}_eval_gens.json"
+        args.load_generations_path = f"tests/data/{task}_eval_gens.json"
         evaluator = Evaluator(accelerator, None, None, args)
         results = evaluator.evaluate(task)
         assert results == REF_EVAL_SCORES[task]
