Commit 8b28d3a

Merge pull request #62 from bigcode-project/rename-path-arguments
Rename path arguments
2 parents: 705b007 + 4571452

5 files changed: +31 -23 lines changed

README.md

Lines changed: 2 additions & 2 deletions
@@ -89,7 +89,7 @@ accelerate launch main.py \
 * `limit` represents the number of problems to solve, if it's not provided all problems in the benchmark are selected.
 * `allow_code_execution` is for executing the generated code: it is off by default, read the displayed warning before calling it to enable execution.
 * Some models with custom code on the HF hub like [SantaCoder](https://huggingface.co/bigcode/santacoder) require calling `--trust_remote_code`, for private models add `--use_auth_token`.
-* `save_generations` saves the post-processed generations in a json file. You can also save references by calling `--save_references`
+* `save_generations` saves the post-processed generations in a json file at `save_generations_path` (by default `generations.json`). You can also save references by calling `--save_references`

 Some tasks don't require code execution such as
 `codexglue_code_to_text-<LANGUAGE>`/`codexglue_code_to_text-python-left`/`conala`/`concode` that use BLEU evaluation. In addition, we generate one candidate solution for each problem in these tasks, so use `n_samples=1` and `batch_size=1`. (Note that `batch_size` should always be equal or less than `n_samples`).
@@ -108,7 +108,7 @@ If you already have the generations in a json file from this evaluation harness
 Below is an example, be mind of specifying arguments proper to the task you are evaluating on, and note that `model` value here only serves for documenting the experiment.

 ```bash
-accelerate launch main.py --tasks mbpp --allow_code_execution --generations_path generations.json --model incoder-temperature-08
+accelerate launch main.py --tasks mbpp --allow_code_execution --load_generations_path generations.json --model incoder-temperature-08
 ```

 ## Implementing new tasks
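
Since `--save_generations_path` and `--load_generations_path` point at the same JSON file, a saved file can be inspected before being fed back for an evaluation-only run. A minimal sketch, assuming the file holds one list of candidate solutions per problem (the path and the printed summary are illustrative, not from the repository):

```python
import json

# Hypothetical check of a file written with --save_generations_path
# before reusing it via --load_generations_path.
with open("generations.json") as fp:
    generations = json.load(fp)

print(f"{len(generations)} problems loaded")
print(f"{len(generations[0])} candidate solutions for the first problem")
```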

lm_eval/evaluator.py

Lines changed: 6 additions & 5 deletions
@@ -32,7 +32,7 @@ def __init__(self, accelerator, model, tokenizer, args):
         self.args = args

         # setup arguments
-        self.output_path = args.output_path
+        self.metric_output_path = args.metric_output_path

         # code evaluation permission
         self.allow_code_execution = args.allow_code_execution
@@ -67,19 +67,20 @@ def evaluate(self, task_name):
         generations, references = self.generate_text(task_name)

         if self.accelerator.is_main_process:
-            if not self.args.generations_path:
+            if not self.args.load_generations_path:
                 if self.args.save_generations:
-                    with open("generations.json", "w") as fp:
+                    with open(self.args.save_generations_path, "w") as fp:
                         json.dump(generations, fp)
-                        print("generations were saved")
+                        print(f"generations were saved at {self.args.save_generations_path}")
                 if self.args.save_references:
                     with open("references.json", "w") as fp:
                         json.dump(references, fp)
-                        print("references were saved")
+                        print("references were saved at references.json")

             # make sure tokenizer plays nice with multiprocessing
             os.environ["TOKENIZERS_PARALLELISM"] = "false"
             if self.allow_code_execution and task.requires_execution:
                 os.environ["HF_ALLOW_CODE_EVAL"] = "1"
+            print("Evaluating generations...")
             results = task.process_results(generations, references)
             return results
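
The renamed attributes only change where the artifacts land on disk. A minimal sketch of the new save branch in isolation, using an `argparse.Namespace` as a stand-in for the parsed arguments (the field values below are illustrative, not from the repository):

```python
import json
from argparse import Namespace

# Stand-in for the parsed CLI arguments after the rename; values are examples.
args = Namespace(
    load_generations_path=None,            # nothing preloaded, so generation ran
    save_generations=True,
    save_generations_path="my_generations.json",
    metric_output_path="evaluation_results.json",
)

generations = [["def add(a, b):\n    return a + b"]]  # dummy generations

# Mirrors the updated branch: generations are only written to disk
# when they were not loaded from a file in the first place.
if not args.load_generations_path and args.save_generations:
    with open(args.save_generations_path, "w") as fp:
        json.dump(generations, fp)
        print(f"generations were saved at {args.save_generations_path}")
```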

lm_eval/generation.py

Lines changed: 2 additions & 2 deletions
@@ -36,9 +36,9 @@ def __call__(self, input_ids, scores, **kwargs):
 

 def parallel_generations(task, dataset, accelerator, model, tokenizer, n_tasks, args):
-    if args.generations_path:
+    if args.load_generations_path:
         # load generated code
-        with open(args.generations_path) as fp:
+        with open(args.load_generations_path) as fp:
             generations = json.load(fp)
             if accelerator.is_main_process:
                 print(
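
The load side is symmetric: when `load_generations_path` is set, `parallel_generations` reads the file and skips model generation. A small standalone sketch of that early-return branch; only the renamed argument comes from the diff, the helper name and the truncation to `n_tasks` are written out here for illustration:

```python
import json

def load_precomputed_generations(load_generations_path, n_tasks):
    """Sketch: read previously generated solutions and skip generation.
    Only the renamed argument comes from the diff; the rest is illustrative."""
    with open(load_generations_path) as fp:
        generations = json.load(fp)
    print(f"loaded generations for {min(n_tasks, len(generations))} of {len(generations)} problems")
    return generations[:n_tasks]

# Hypothetical usage with a file produced by --save_generations_path:
# generations = load_precomputed_generations("generations.json", n_tasks=10)
```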

main.py

Lines changed: 17 additions & 11 deletions
@@ -70,6 +70,12 @@ def parse_args():
         default=512,
         help="Maximum length of generated sequence (prompt+generation)",
     )
+    parser.add_argument(
+        "--precision",
+        type=str,
+        default="fp32",
+        help="Model precision, from: fp32, fp16 or bf16",
+    )
     parser.add_argument(
         "--limit",
         type=int,
@@ -92,19 +98,13 @@ def parse_args():
         help="Do code generation but no evaluation",
     )
     parser.add_argument(
-        "--precision",
-        type=str,
-        default="fp32",
-        help="Model precision, from: fp32, fp16 or bf16",
-    )
-    parser.add_argument(
-        "--generations_path",
+        "--load_generations_path",
         type=str,
         default=None,
         help="Path of file with previously generated solutions, if provided generation is skipped and only evaluation is done",
     )
     parser.add_argument(
-        "--output_path",
+        "--metric_output_path",
         type=str,
         default="evaluation_results.json",
         help="Path to save the results",
@@ -114,6 +114,12 @@ def parse_args():
         action="store_true",
         help="Whether to save code generations",
     )
+    parser.add_argument(
+        "--save_generations_path",
+        type=str,
+        default="generations.json",
+        help="Path for saving the code generations",
+    )
     parser.add_argument(
         "--save_references",
         action="store_true",
@@ -147,7 +153,7 @@ def main():
     print(f"Selected Tasks: {task_names}")

     results = {}
-    if args.generations_path:
+    if args.load_generations_path:
         # here we don't generate code but only evaluate previously computed generations
         if accelerator.is_main_process:
             print("evaluation only mode")
@@ -199,13 +205,13 @@ def main():
         else:
             results[task] = evaluator.evaluate(task)

-    results["config"] = {"model": args.model}
+    results["config"] = {"model": args.model, "temperature": args.temperature, "n_samples": args.n_samples}
     if not args.generation_only:
         dumped = json.dumps(results, indent=2)
         if accelerator.is_main_process:
             print(dumped)

-        with open(args.output_path, "w") as f:
+        with open(args.metric_output_path, "w") as f:
             f.write(dumped)
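
For driver scripts that call the harness, only the option names change; defaults and help text stay as shown in the diff. A minimal sketch of just the renamed and added options (the real `parse_args` defines many more, and the help strings are abridged here):

```python
import argparse

# Only the options touched by this commit; the real parser has many more.
parser = argparse.ArgumentParser()
parser.add_argument("--load_generations_path", type=str, default=None,
                    help="Previously generated solutions; skips generation if set")
parser.add_argument("--metric_output_path", type=str, default="evaluation_results.json",
                    help="Where the evaluation results are written")
parser.add_argument("--save_generations_path", type=str, default="generations.json",
                    help="Where post-processed generations are written")
parser.add_argument("--precision", type=str, default="fp32",
                    help="Model precision, from: fp32, fp16 or bf16")

# Example: evaluation-only invocation, mirroring the README command.
args = parser.parse_args(["--load_generations_path", "generations.json"])
print(args.load_generations_path, args.metric_output_path, args.save_generations_path)
```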

tests/test_generation_evaluation.py

Lines changed: 4 additions & 3 deletions
@@ -29,9 +29,10 @@ def update_args(args):
     # the executed code for the tests is safe (see tests/data/*_eval_gens.json)
     args.allow_code_execution = True
     args.save_generations = False
+    args.save_generations_path = ""
     args.save_references = False
-    args.output_path = TMPDIR
-    args.generations_path = None
+    args.metric_output_path = TMPDIR
+    args.load_generations_path = None
     args.generation_only = False
     # postprocessing for HumanEval and MBPP makes generations
     # with dummy model not distinctive
@@ -90,7 +91,7 @@ def test_evaluation():
     for task in EVAL_TASKS:
         print(f"testing task {task}")
         # path to generation examples to evaluate
-        args.generations_path = f"tests/data/{task}_eval_gens.json"
+        args.load_generations_path = f"tests/data/{task}_eval_gens.json"
         evaluator = Evaluator(accelerator, None, None, args)
         results = evaluator.evaluate(task)
         assert results == REF_EVAL_SCORES[task]
