From 2d96e47f059e3b7d92f1573359a57adfa4b787b0 Mon Sep 17 00:00:00 2001 From: Jean Mercat Date: Tue, 19 Nov 2024 10:09:04 -0800 Subject: [PATCH 1/3] change slicing in compute, change rank to accelerator.process_index --- eval/chat_benchmarks/HumanEval/eval_instruct.py | 2 +- eval/chat_benchmarks/IFEval/eval_instruct.py | 2 +- eval/chat_benchmarks/MBPP/eval_instruct.py | 2 +- eval/chat_benchmarks/MTBench/eval_instruct.py | 2 +- .../fastchat/modules/xfastertransformer.py | 2 +- eval/chat_benchmarks/MixEval/eval_instruct.py | 8 ++++---- eval/chat_benchmarks/RepoBench/eval_instruct.py | 15 +++++++++------ eval/chat_benchmarks/WildBench/eval_instruct.py | 2 +- .../chat_benchmarks/alpaca_eval/eval_instruct.py | 2 +- ...ighted_alpaca_eval_gpt4_turbo_leaderboard.csv | 8 +++++--- .../baseline_gpt4_1106_preview.csv | 2 ++ eval/chat_benchmarks/zeroeval/eval_instruct.py | 2 +- eval/eval.py | 4 ++-- eval/task.py | 16 ++++++++-------- 14 files changed, 38 insertions(+), 31 deletions(-) diff --git a/eval/chat_benchmarks/HumanEval/eval_instruct.py b/eval/chat_benchmarks/HumanEval/eval_instruct.py index ad705099..17149ef6 100644 --- a/eval/chat_benchmarks/HumanEval/eval_instruct.py +++ b/eval/chat_benchmarks/HumanEval/eval_instruct.py @@ -112,7 +112,7 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: self.logger.info("Generating responses for Human Eval...") outputs = self.compute(model, all_instances) - if model.rank != 0: + if model.accelerator.process_index != 0: continue generated_examples = [] diff --git a/eval/chat_benchmarks/IFEval/eval_instruct.py b/eval/chat_benchmarks/IFEval/eval_instruct.py index 7e500dbe..6f5f0b7d 100644 --- a/eval/chat_benchmarks/IFEval/eval_instruct.py +++ b/eval/chat_benchmarks/IFEval/eval_instruct.py @@ -115,7 +115,7 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: self.logger.info("Generating responses...") outputs = self.compute(model, all_instances) - if model.rank != 0: + if model.accelerator.process_index != 0: return None generated_examples = [] diff --git a/eval/chat_benchmarks/MBPP/eval_instruct.py b/eval/chat_benchmarks/MBPP/eval_instruct.py index 875c8bac..eff2af9d 100644 --- a/eval/chat_benchmarks/MBPP/eval_instruct.py +++ b/eval/chat_benchmarks/MBPP/eval_instruct.py @@ -161,7 +161,7 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: outputs = self.compute(model, all_instances) # Return None early for non-primary ranks - if model.rank != 0: + if model.accelerator.process_index != 0: return None generated_examples = [] diff --git a/eval/chat_benchmarks/MTBench/eval_instruct.py b/eval/chat_benchmarks/MTBench/eval_instruct.py index 202662ac..b35a58c0 100644 --- a/eval/chat_benchmarks/MTBench/eval_instruct.py +++ b/eval/chat_benchmarks/MTBench/eval_instruct.py @@ -151,7 +151,7 @@ def get_model_answers(self, model: LM, model_id: str, questions: List[Dict[str, all_convs[q_idx].append({"role": "assistant", "content": output}) all_choices[q_idx]["turns"].append(output) - if model.rank != 0: + if model.accelerator.process_index != 0: continue # Save completed conversations diff --git a/eval/chat_benchmarks/MTBench/fastchat/modules/xfastertransformer.py b/eval/chat_benchmarks/MTBench/fastchat/modules/xfastertransformer.py index 8c95d4d7..557ec4c4 100644 --- a/eval/chat_benchmarks/MTBench/fastchat/modules/xfastertransformer.py +++ b/eval/chat_benchmarks/MTBench/fastchat/modules/xfastertransformer.py @@ -36,7 +36,7 @@ def load_xft_model(model_path, xft_config: XftConfig): tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, 
padding_side="left", trust_remote_code=True) xft_model = xfastertransformer.AutoModel.from_pretrained(model_path, dtype=data_type) model = XftModel(xft_model=xft_model, xft_config=xft_config) - if model.model.rank > 0: + if model.model.accelerator.process_index > 0: while True: model.model.generate() return model, tokenizer diff --git a/eval/chat_benchmarks/MixEval/eval_instruct.py b/eval/chat_benchmarks/MixEval/eval_instruct.py index d92c7e8e..ff56f47b 100644 --- a/eval/chat_benchmarks/MixEval/eval_instruct.py +++ b/eval/chat_benchmarks/MixEval/eval_instruct.py @@ -135,7 +135,7 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: for split in splits: self.args.split = split all_results = self._eval_split(model, split) - if model.rank == 0: + if model.accelerator.process_index == 0: response_file = self._get_response_file() with open(response_file, "w") as f: for result in all_results: @@ -143,7 +143,7 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: out_dict[split] = all_results # Only return results on rank 0 - if model.world_size > 1 and model.rank != 0: + if model.world_size > 1 and model.accelerator.process_index != 0: return None return out_dict @@ -192,7 +192,7 @@ def _eval_split(self, model: LM, split: str) -> List[Dict[str, Any]]: for idx in list(range(len(eval_dataset.raw_inputs))): eval_dataset.raw_inputs[idx]["response"] = all_responses[idx] - if model.rank == 0: + if model.accelerator.process_index == 0: with open(response_file, "w") as f: for item in eval_dataset.raw_inputs: json_line = json.dumps(item) @@ -243,7 +243,7 @@ def run_benchmark(self, model: LM) -> Dict[str, Any]: generation_results = self.generate_responses(model) # Only evaluate on rank 0 - if model.world_size > 1 and model.rank != 0: + if model.world_size > 1 and model.accelerator.process_index != 0: return None evaluation_results = self.evaluate_responses(generation_results) diff --git a/eval/chat_benchmarks/RepoBench/eval_instruct.py b/eval/chat_benchmarks/RepoBench/eval_instruct.py index cb207c10..cc2251ce 100644 --- a/eval/chat_benchmarks/RepoBench/eval_instruct.py +++ b/eval/chat_benchmarks/RepoBench/eval_instruct.py @@ -59,7 +59,7 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: if self.legacy_mode: return self._generate_responses_legacy(model) - if model.rank == 0: + if model.accelerator.process_index == 0: temp_dir_obj = tempfile.TemporaryDirectory() temp_dir = temp_dir_obj.name @@ -76,8 +76,11 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: all_instances = [] # Split dataset across ranks for parallel construction - # Get subset of dataset for this rank using built-in slice functionality - rank_dataset = list(islice(dataset, model.rank, len(dataset), model.world_size)) + # Get subset of dataset for this rank using the same slicing strategy as the compute function + chunk_size = len(dataset) // model.world_size + start = model.accelerator.process_index * chunk_size + end = start + chunk_size if model.accelerator.process_index < model.world_size - 1 else len(dataset) + rank_dataset = dataset.select(range(start, end)) # Process examples for this rank's shard for idx, example in enumerate(rank_dataset): @@ -100,7 +103,7 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: outputs = self.compute(model, all_instances, do_slice=False) # Only rank 0 should save the results - if model.rank != 0: + if model.accelerator.process_indexlerator.process_index != 0: continue generated_examples = [] @@ -118,7 +121,7 @@ def generate_responses(self, model: LM) -> 
Dict[str, Any]: for ex in generated_examples: fw.write(json.dumps(ex) + "\n") - if model.rank == 0: + if model.accelerator.process_index == 0: return {"temp_dir_obj": temp_dir_obj} def _generate_responses_legacy(self, model: LM) -> Dict[str, Any]: @@ -156,7 +159,7 @@ def _generate_responses_legacy(self, model: LM) -> Dict[str, Any]: outputs = self.compute(model, all_instances, do_slice=False) - if model.rank != 0: + if model.accelerator.process_index != 0: continue generated_examples = [] diff --git a/eval/chat_benchmarks/WildBench/eval_instruct.py b/eval/chat_benchmarks/WildBench/eval_instruct.py index 100e6d68..f909b206 100644 --- a/eval/chat_benchmarks/WildBench/eval_instruct.py +++ b/eval/chat_benchmarks/WildBench/eval_instruct.py @@ -196,7 +196,7 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: outputs = self.compute(model, all_instances) # Return None early for non-primary ranks - if model.rank != 0: + if model.accelerator.process_index != 0: return None outputs = [[output] for output in outputs] diff --git a/eval/chat_benchmarks/alpaca_eval/eval_instruct.py b/eval/chat_benchmarks/alpaca_eval/eval_instruct.py index 0a79b1e8..f1d6170b 100644 --- a/eval/chat_benchmarks/alpaca_eval/eval_instruct.py +++ b/eval/chat_benchmarks/alpaca_eval/eval_instruct.py @@ -117,7 +117,7 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: self.logger.info("Generating responses for Alpaca Eval...") outputs = self.compute(model, all_instances) - if model.rank != 0: + if model.accelerator.process_index != 0: return None model_outputs = [] diff --git a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/leaderboards/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/leaderboards/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv index 25a5c698..01881528 100644 --- a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/leaderboards/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv +++ b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/leaderboards/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv @@ -1,9 +1,10 @@ ,win_rate,standard_error,n_wins,n_wins_base,n_draws,n_total,discrete_win_rate,mode,avg_length,length_controlled_winrate,lc_standard_error -Shopee-SlimMoA-v1,75.61428659805350,1.2706274059194700,621,184,0,805,77.14285714285720,community,1994,77.4515432873834,0.43017522149239600 -blendaxai-gm-l6-vo31,69.11033492869565,1.3280735654354863,562,242,1,805,69.87577639751554,community,1809,76.91981221023656,0.5725365663132986 +Shopee-SlimMoA-v1,75.6142865980535,1.27062740591947,621,184,0,805,77.1428571428572,community,1994,77.4515432873834,0.430175221492396 +blendaxai-gm-l6-vo31,69.11033492869565,1.3280735654354865,562,242,1,805,69.87577639751554,community,1809,76.91981221023656,0.5725365663132986 gemma-2-9b-it-WPO-HB,77.82503168985093,1.2355857177790277,640,163,2,805,79.62732919254658,community,2285,76.72506842726064,0.4242603928637889 blendaxai-gm-l3-v35,73.41035740244067,1.254951147343878,607,196,2,805,75.527950310559,community,2186,73.37270365010379,0.6163911450738288 gemma-2-9b-it-SimPO,65.86422561532919,1.423459922555078,540,264,1,805,67.14285714285714,community,1833,72.3508446939842,0.5167873784867067 +model_hf_model_args_pretrained=mlfoundations-dev__gemma-simpo-reproduction,67.35102937013792,1.4210070002869848,557,247,1,805,69.25465838509317,community,1950,71.18995900084634,0.5756949353655318 
openpipe-moa-gpt-4-turbo-v1,63.15493451236265,1.422980098799326,515,283,7,805,64.40993788819875,community,1856,68.37866250336802,0.7309418614587613 gemma-2-9b-it-DPO,65.35922380122982,1.402802336467638,536,268,1,805,66.64596273291924,community,2016,67.6620382198043,0.6605613085864308 Together-MoA,59.8688062333292,1.434305604543079,490,314,1,805,60.93167701863354,community,1825,65.37996976852163,0.7392392836781445 @@ -22,7 +23,7 @@ gpt4_1106_preview_verbose,64.30360147101865,1.3348590089025316,525,268,12,805,65 gpt-4o-mini-2024-07-18,44.65413862507926,1.4572395578449813,350,451,4,805,43.72670807453416,minimal,1861,50.727144855901976,0.8284734951761676 Storm-7B,50.26886905528583,1.4728176780737183,397,408,0,805,49.31677018633541,community,2045,50.45110959343775, gpt4_1106_preview,50.0,0.0,0,0,805,805,50.0,minimal,2049,50.0, -REBEL-Llama-3-8B-Instruct-Armo,48.43655307668638,1.480341435123528,394,410,1,805,49.006211180124225,community,1965,49.314293536857114,0.7061879308002301 +REBEL-Llama-3-8B-Instruct-Armo,48.43655307668638,1.480341435123528,394,410,1,805,49.00621118012423,community,1965,49.31429353685712,0.7061879308002301 Infinity-Instruct-7M-Gen-Llama3_1-70B,37.46327383827497,1.4734130373862548,299,501,5,805,37.453416149068325,community,1654,46.10043331712677,0.822439983375277 Llama-3-Instruct-8B-SimPO-ExPO,40.63285400856655,1.4439449942168028,325,479,1,805,40.43478260869565,community,1765,45.78021783946177, Llama-3-Instruct-8B-SimPO,40.52977498461182,1.422574464675002,319,485,1,805,39.68944099378882,community,1825,44.65131348921881,0.8800655791760451 @@ -209,3 +210,4 @@ guanaco-13b,3.469596859739131,0.5518606725700214,22,780,3,805,2.919254658385093, guanaco-7b,2.880002266173913,0.5202924149314048,21,783,1,805,2.670807453416149,verified,1364,2.871116813131697, Qwen1.5-1.8B-Chat,3.70555681579365,0.5811750995496215,27,774,3,804,3.544776119402985,verified,2673,2.588498849185137, baichuan-13b-chat,1.9921455615279504,0.4176985079331233,14,790,1,805,1.8012422360248446,community,1727,2.062170253598568, +model_hf_model_args_pretrained=mlfoundations-dev__gemma-oh-preferences,0.005260368511326853,0.0018774672393365112,0,805,0,805,0.0,community,196,0.010252829751292214,0.0007495965900756891 diff --git a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt4_turbo/length_controlled_v1/baseline_gpt4_1106_preview.csv b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt4_turbo/length_controlled_v1/baseline_gpt4_1106_preview.csv index ad2e31af..7d6fa028 100644 --- a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt4_turbo/length_controlled_v1/baseline_gpt4_1106_preview.csv +++ b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt4_turbo/length_controlled_v1/baseline_gpt4_1106_preview.csv @@ -186,3 +186,5 @@ Mistral-7B-Instruct-v0.3,-1.5007159011881868,0.9845683091847074,-1.7652759895328 Shopee-SlimMoA-v1,-0.6930943742294789,0.5778443790027642,1.4506276222723822 blendaxai-gm-l6-vo31,-1.4827230167114802,0.8256378421072179,1.5942312525409852 REBEL-Llama-3-8B-Instruct-Armo,-1.0427168605260002,0.6464073051877255,0.0395191056877229 +model_hf_model_args_pretrained=mlfoundations-dev__gemma-simpo-reproduction,-1.1818376919023723,0.6835318362039150,1.1479555832649320 +model_hf_model_args_pretrained=mlfoundations-dev__gemma-oh-preferences,-1.8345282763259563,0.7434213717748921,-9.8937244442602008 diff --git a/eval/chat_benchmarks/zeroeval/eval_instruct.py 
b/eval/chat_benchmarks/zeroeval/eval_instruct.py index c68a4379..e5e73c3e 100644 --- a/eval/chat_benchmarks/zeroeval/eval_instruct.py +++ b/eval/chat_benchmarks/zeroeval/eval_instruct.py @@ -144,7 +144,7 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: outputs = self.compute(model, all_instances) - if model.rank != 0: + if model.accelerator.process_index != 0: continue outputs = [[output] for output in outputs] diff --git a/eval/eval.py b/eval/eval.py index 5bd177cf..2c0d5682 100644 --- a/eval/eval.py +++ b/eval/eval.py @@ -148,7 +148,7 @@ def evaluate( cpu_count = os.cpu_count() max_workers = min(len(valid_tasks), cpu_count * 2) - if lm.world_size <= 1 or lm.rank == 0: + if lm.world_size <= 1 or lm.accelerator.process_index == 0: with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: evaluate_results = list( executor.map( @@ -302,7 +302,7 @@ def cli_evaluate(args: Optional[argparse.Namespace] = None) -> None: ) # Add metadata to results - if lm.rank == 0: + if lm.accelerator.process_index == 0: add_results_metadata(results, args, lm) handle_evaluation_output(results, args, evaluation_tracker, wandb_logger) diff --git a/eval/task.py b/eval/task.py index aec96eca..337dd6d4 100644 --- a/eval/task.py +++ b/eval/task.py @@ -20,23 +20,23 @@ def __init__(self, logger: Optional[logging.Logger] = None): def compute(self, model: LM, inputs: List[Instance], do_slice: bool = True) -> List[str]: if model.world_size > 1 and do_slice: - prompts = list(islice(inputs, model.rank, len(inputs), model.world_size)) + chunk_size = len(inputs) // model.world_size + start = model.accelerator.process_index * chunk_size + end = start + chunk_size if model.accelerator.process_index < model.world_size - 1 else len(inputs) + prompts = inputs[start:end] else: prompts = inputs results = model.generate_until(prompts) if model.world_size > 1: all_results = [None for _ in range(model.world_size)] - dist.all_gather_object(all_results, results) - # Merge results from all ranks - length = sum(len(res) for res in all_results if res is not None) - merged = [None] * length - for rank, sub_results in enumerate(all_results): + # Simply concatenate results in rank order + merged = [] + for sub_results in all_results: if sub_results is not None: - for i, item in enumerate(sub_results): - merged[i * model.world_size + rank] = item + merged.extend(sub_results) return merged else: return results From aa7b45189bb411eb305ce3ae828fc1e8066b9838 Mon Sep 17 00:00:00 2001 From: Negin Raoof Date: Thu, 21 Nov 2024 20:53:54 -0600 Subject: [PATCH 2/3] fix for single process --- .../HumanEval/eval_instruct.py | 3 +- eval/chat_benchmarks/IFEval/eval_instruct.py | 3 +- eval/chat_benchmarks/MBPP/eval_instruct.py | 3 +- eval/chat_benchmarks/MTBench/eval_instruct.py | 3 +- eval/chat_benchmarks/MixEval/eval_instruct.py | 12 ++- .../RepoBench/eval_instruct.py | 19 ++-- .../WildBench/eval_instruct.py | 3 +- .../alpaca_eval/eval_instruct.py | 3 +- ...ted_alpaca_eval_gpt4_turbo_leaderboard.csv | 10 +- .../baseline_gpt4_1106_preview.csv | 2 +- .../chat_benchmarks/zeroeval/eval_instruct.py | 3 +- eval/eval.py | 102 +++++++++++++++++- 12 files changed, 140 insertions(+), 26 deletions(-) diff --git a/eval/chat_benchmarks/HumanEval/eval_instruct.py b/eval/chat_benchmarks/HumanEval/eval_instruct.py index 17149ef6..85d5f066 100644 --- a/eval/chat_benchmarks/HumanEval/eval_instruct.py +++ b/eval/chat_benchmarks/HumanEval/eval_instruct.py @@ -112,7 +112,8 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: 
self.logger.info("Generating responses for Human Eval...") outputs = self.compute(model, all_instances) - if model.accelerator.process_index != 0: + is_main_process = lm.accelerator.process_index == 0 if hasattr(lm, 'accelerator') else lm.world_size <= 1 + if not is_main_process: continue generated_examples = [] diff --git a/eval/chat_benchmarks/IFEval/eval_instruct.py b/eval/chat_benchmarks/IFEval/eval_instruct.py index 6f5f0b7d..5c7b1d8e 100644 --- a/eval/chat_benchmarks/IFEval/eval_instruct.py +++ b/eval/chat_benchmarks/IFEval/eval_instruct.py @@ -115,7 +115,8 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: self.logger.info("Generating responses...") outputs = self.compute(model, all_instances) - if model.accelerator.process_index != 0: + is_main_process = model.accelerator.process_index == 0 if hasattr(model, 'accelerator') else model.world_size <= 1 + if not is_main_process: return None generated_examples = [] diff --git a/eval/chat_benchmarks/MBPP/eval_instruct.py b/eval/chat_benchmarks/MBPP/eval_instruct.py index eff2af9d..6d589bf6 100644 --- a/eval/chat_benchmarks/MBPP/eval_instruct.py +++ b/eval/chat_benchmarks/MBPP/eval_instruct.py @@ -161,7 +161,8 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: outputs = self.compute(model, all_instances) # Return None early for non-primary ranks - if model.accelerator.process_index != 0: + is_main_process = model.accelerator.process_index == 0 if hasattr(model, 'accelerator') else model.world_size <= 1 + if not is_main_process: return None generated_examples = [] diff --git a/eval/chat_benchmarks/MTBench/eval_instruct.py b/eval/chat_benchmarks/MTBench/eval_instruct.py index b35a58c0..7a61f238 100644 --- a/eval/chat_benchmarks/MTBench/eval_instruct.py +++ b/eval/chat_benchmarks/MTBench/eval_instruct.py @@ -151,7 +151,8 @@ def get_model_answers(self, model: LM, model_id: str, questions: List[Dict[str, all_convs[q_idx].append({"role": "assistant", "content": output}) all_choices[q_idx]["turns"].append(output) - if model.accelerator.process_index != 0: + is_main_process = model.accelerator.process_index == 0 if hasattr(model, 'accelerator') else model.world_size <= 1 + if not is_main_process: continue # Save completed conversations diff --git a/eval/chat_benchmarks/MixEval/eval_instruct.py b/eval/chat_benchmarks/MixEval/eval_instruct.py index ff56f47b..f15f2b52 100644 --- a/eval/chat_benchmarks/MixEval/eval_instruct.py +++ b/eval/chat_benchmarks/MixEval/eval_instruct.py @@ -132,10 +132,12 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: out_dict = {} self.logger.info("Generating responses for MixEval...") + is_main_process = model.accelerator.process_index == 0 if hasattr(model, 'accelerator') else model.world_size <= 1 + for split in splits: self.args.split = split all_results = self._eval_split(model, split) - if model.accelerator.process_index == 0: + if is_main_process: response_file = self._get_response_file() with open(response_file, "w") as f: for result in all_results: @@ -143,7 +145,7 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: out_dict[split] = all_results # Only return results on rank 0 - if model.world_size > 1 and model.accelerator.process_index != 0: + if not is_main_process: return None return out_dict @@ -192,7 +194,8 @@ def _eval_split(self, model: LM, split: str) -> List[Dict[str, Any]]: for idx in list(range(len(eval_dataset.raw_inputs))): eval_dataset.raw_inputs[idx]["response"] = all_responses[idx] - if model.accelerator.process_index == 0: + is_main_process = 
model.accelerator.process_index == 0 if hasattr(model, 'accelerator') else model.world_size <= 1 + if is_main_process: with open(response_file, "w") as f: for item in eval_dataset.raw_inputs: json_line = json.dumps(item) @@ -243,7 +246,8 @@ def run_benchmark(self, model: LM) -> Dict[str, Any]: generation_results = self.generate_responses(model) # Only evaluate on rank 0 - if model.world_size > 1 and model.accelerator.process_index != 0: + is_main_process = model.accelerator.process_index == 0 if hasattr(model, 'accelerator') else model.world_size <= 1 + if not is_main_process: return None evaluation_results = self.evaluate_responses(generation_results) diff --git a/eval/chat_benchmarks/RepoBench/eval_instruct.py b/eval/chat_benchmarks/RepoBench/eval_instruct.py index cc2251ce..ac89a691 100644 --- a/eval/chat_benchmarks/RepoBench/eval_instruct.py +++ b/eval/chat_benchmarks/RepoBench/eval_instruct.py @@ -59,7 +59,8 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: if self.legacy_mode: return self._generate_responses_legacy(model) - if model.accelerator.process_index == 0: + is_main_process = model.accelerator.process_index == 0 if hasattr(model, 'accelerator') else model.world_size <= 1 + if is_main_process: temp_dir_obj = tempfile.TemporaryDirectory() temp_dir = temp_dir_obj.name @@ -77,10 +78,13 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: all_instances = [] # Split dataset across ranks for parallel construction # Get subset of dataset for this rank using the same slicing strategy as the compute function - chunk_size = len(dataset) // model.world_size - start = model.accelerator.process_index * chunk_size - end = start + chunk_size if model.accelerator.process_index < model.world_size - 1 else len(dataset) - rank_dataset = dataset.select(range(start, end)) + if hasattr(model, 'accelerator'): + chunk_size = len(dataset) // model.world_size + start = model.accelerator.process_index * chunk_size + end = start + chunk_size if model.accelerator.process_index < model.world_size - 1 else len(dataset) + rank_dataset = dataset.select(range(start, end)) + else: + rank_dataset = list(islice(dataset, model.rank, len(dataset), model.world_size)) # Process examples for this rank's shard for idx, example in enumerate(rank_dataset): @@ -103,7 +107,8 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: outputs = self.compute(model, all_instances, do_slice=False) # Only rank 0 should save the results - if model.accelerator.process_indexlerator.process_index != 0: + is_main_process = model.accelerator.process_index == 0 if hasattr(model, 'accelerator') else model.world_size <= 1 + if not is_main_process: continue generated_examples = [] @@ -121,7 +126,7 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: for ex in generated_examples: fw.write(json.dumps(ex) + "\n") - if model.accelerator.process_index == 0: + if is_main_process: return {"temp_dir_obj": temp_dir_obj} def _generate_responses_legacy(self, model: LM) -> Dict[str, Any]: diff --git a/eval/chat_benchmarks/WildBench/eval_instruct.py b/eval/chat_benchmarks/WildBench/eval_instruct.py index f909b206..2ff69cea 100644 --- a/eval/chat_benchmarks/WildBench/eval_instruct.py +++ b/eval/chat_benchmarks/WildBench/eval_instruct.py @@ -196,7 +196,8 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: outputs = self.compute(model, all_instances) # Return None early for non-primary ranks - if model.accelerator.process_index != 0: + is_main_process = model.accelerator.process_index == 0 if hasattr(model, 
'accelerator') else model.world_size <= 1 + if not is_main_process: return None outputs = [[output] for output in outputs] diff --git a/eval/chat_benchmarks/alpaca_eval/eval_instruct.py b/eval/chat_benchmarks/alpaca_eval/eval_instruct.py index f1d6170b..09f86da4 100644 --- a/eval/chat_benchmarks/alpaca_eval/eval_instruct.py +++ b/eval/chat_benchmarks/alpaca_eval/eval_instruct.py @@ -117,7 +117,8 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: self.logger.info("Generating responses for Alpaca Eval...") outputs = self.compute(model, all_instances) - if model.accelerator.process_index != 0: + is_main_process = model.accelerator.process_index == 0 if hasattr(model, 'accelerator') else model.world_size <= 1 + if not is_main_process: return None model_outputs = [] diff --git a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/leaderboards/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/leaderboards/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv index 01881528..185f33c5 100644 --- a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/leaderboards/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv +++ b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/leaderboards/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv @@ -1,10 +1,9 @@ ,win_rate,standard_error,n_wins,n_wins_base,n_draws,n_total,discrete_win_rate,mode,avg_length,length_controlled_winrate,lc_standard_error -Shopee-SlimMoA-v1,75.6142865980535,1.27062740591947,621,184,0,805,77.1428571428572,community,1994,77.4515432873834,0.430175221492396 -blendaxai-gm-l6-vo31,69.11033492869565,1.3280735654354865,562,242,1,805,69.87577639751554,community,1809,76.91981221023656,0.5725365663132986 +Shopee-SlimMoA-v1,75.61428659805350,1.2706274059194700,621,184,0,805,77.14285714285720,community,1994,77.4515432873834,0.43017522149239600 +blendaxai-gm-l6-vo31,69.11033492869565,1.3280735654354863,562,242,1,805,69.87577639751554,community,1809,76.91981221023656,0.5725365663132986 gemma-2-9b-it-WPO-HB,77.82503168985093,1.2355857177790277,640,163,2,805,79.62732919254658,community,2285,76.72506842726064,0.4242603928637889 blendaxai-gm-l3-v35,73.41035740244067,1.254951147343878,607,196,2,805,75.527950310559,community,2186,73.37270365010379,0.6163911450738288 gemma-2-9b-it-SimPO,65.86422561532919,1.423459922555078,540,264,1,805,67.14285714285714,community,1833,72.3508446939842,0.5167873784867067 -model_hf_model_args_pretrained=mlfoundations-dev__gemma-simpo-reproduction,67.35102937013792,1.4210070002869848,557,247,1,805,69.25465838509317,community,1950,71.18995900084634,0.5756949353655318 openpipe-moa-gpt-4-turbo-v1,63.15493451236265,1.422980098799326,515,283,7,805,64.40993788819875,community,1856,68.37866250336802,0.7309418614587613 gemma-2-9b-it-DPO,65.35922380122982,1.402802336467638,536,268,1,805,66.64596273291924,community,2016,67.6620382198043,0.6605613085864308 Together-MoA,59.8688062333292,1.434305604543079,490,314,1,805,60.93167701863354,community,1825,65.37996976852163,0.7392392836781445 @@ -23,7 +22,7 @@ gpt4_1106_preview_verbose,64.30360147101865,1.3348590089025316,525,268,12,805,65 gpt-4o-mini-2024-07-18,44.65413862507926,1.4572395578449813,350,451,4,805,43.72670807453416,minimal,1861,50.727144855901976,0.8284734951761676 Storm-7B,50.26886905528583,1.4728176780737183,397,408,0,805,49.31677018633541,community,2045,50.45110959343775, gpt4_1106_preview,50.0,0.0,0,0,805,805,50.0,minimal,2049,50.0, 
-REBEL-Llama-3-8B-Instruct-Armo,48.43655307668638,1.480341435123528,394,410,1,805,49.00621118012423,community,1965,49.31429353685712,0.7061879308002301 +REBEL-Llama-3-8B-Instruct-Armo,48.43655307668638,1.480341435123528,394,410,1,805,49.006211180124225,community,1965,49.314293536857114,0.7061879308002301 Infinity-Instruct-7M-Gen-Llama3_1-70B,37.46327383827497,1.4734130373862548,299,501,5,805,37.453416149068325,community,1654,46.10043331712677,0.822439983375277 Llama-3-Instruct-8B-SimPO-ExPO,40.63285400856655,1.4439449942168028,325,479,1,805,40.43478260869565,community,1765,45.78021783946177, Llama-3-Instruct-8B-SimPO,40.52977498461182,1.422574464675002,319,485,1,805,39.68944099378882,community,1825,44.65131348921881,0.8800655791760451 @@ -209,5 +208,4 @@ oasst-sft-pythia-12b,1.790114083180124,0.3985580883049341,13,790,2,805,1.7391304 guanaco-13b,3.469596859739131,0.5518606725700214,22,780,3,805,2.919254658385093,verified,1774,3.003787329611614, guanaco-7b,2.880002266173913,0.5202924149314048,21,783,1,805,2.670807453416149,verified,1364,2.871116813131697, Qwen1.5-1.8B-Chat,3.70555681579365,0.5811750995496215,27,774,3,804,3.544776119402985,verified,2673,2.588498849185137, -baichuan-13b-chat,1.9921455615279504,0.4176985079331233,14,790,1,805,1.8012422360248446,community,1727,2.062170253598568, -model_hf_model_args_pretrained=mlfoundations-dev__gemma-oh-preferences,0.005260368511326853,0.0018774672393365112,0,805,0,805,0.0,community,196,0.010252829751292214,0.0007495965900756891 +baichuan-13b-chat,1.9921455615279504,0.4176985079331233,14,790,1,805,1.8012422360248446,community,1727,2.062170253598568, \ No newline at end of file diff --git a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt4_turbo/length_controlled_v1/baseline_gpt4_1106_preview.csv b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt4_turbo/length_controlled_v1/baseline_gpt4_1106_preview.csv index 7d6fa028..621ffd75 100644 --- a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt4_turbo/length_controlled_v1/baseline_gpt4_1106_preview.csv +++ b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt4_turbo/length_controlled_v1/baseline_gpt4_1106_preview.csv @@ -187,4 +187,4 @@ Shopee-SlimMoA-v1,-0.6930943742294789,0.5778443790027642,1.4506276222723822 blendaxai-gm-l6-vo31,-1.4827230167114802,0.8256378421072179,1.5942312525409852 REBEL-Llama-3-8B-Instruct-Armo,-1.0427168605260002,0.6464073051877255,0.0395191056877229 model_hf_model_args_pretrained=mlfoundations-dev__gemma-simpo-reproduction,-1.1818376919023723,0.6835318362039150,1.1479555832649320 -model_hf_model_args_pretrained=mlfoundations-dev__gemma-oh-preferences,-1.8345282763259563,0.7434213717748921,-9.8937244442602008 +model_hf_model_args_pretrained=mlfoundations-dev__gemma-oh-preferences,-1.8345282763259563,0.7434213717748921,-9.8937244442602008 \ No newline at end of file diff --git a/eval/chat_benchmarks/zeroeval/eval_instruct.py b/eval/chat_benchmarks/zeroeval/eval_instruct.py index e5e73c3e..2186b79e 100644 --- a/eval/chat_benchmarks/zeroeval/eval_instruct.py +++ b/eval/chat_benchmarks/zeroeval/eval_instruct.py @@ -144,7 +144,8 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: outputs = self.compute(model, all_instances) - if model.accelerator.process_index != 0: + is_main_process = model.accelerator.process_index == 0 if hasattr(model, 'accelerator') else model.world_size <= 1 + if not is_main_process: 
continue outputs = [[output] for output in outputs] diff --git a/eval/eval.py b/eval/eval.py index 2c0d5682..c806be5e 100644 --- a/eval/eval.py +++ b/eval/eval.py @@ -5,9 +5,11 @@ import sys import time from typing import Optional, List, Dict, Union +from pathlib import Path import concurrent.futures import torch.distributed as dist +from huggingface_hub import snapshot_download from lm_eval import utils from lm_eval import evaluator as pretrain_evaluator @@ -26,6 +28,103 @@ from eval.eval_tracker import DCEvaluationTracker +class ModelInitializer: + """Handles model initialization for distributed evaluations.""" + + def __init__(self, cache_dir: Optional[str] = None): + self.cache_dir = cache_dir or os.getenv('HF_HOME', + os.path.join(os.path.expanduser("~"), ".cache", "huggingface")) + self._ensure_directory(self.cache_dir) + + def _ensure_directory(self, path: str) -> None: + """Safely create directory if it doesn't exist.""" + Path(path).mkdir(parents=True, exist_ok=True) + + def download_model(self, model_id: str) -> None: + """Download model files with proper error handling.""" + try: + snapshot_download( + repo_id=model_id, + cache_dir=self.cache_dir, + local_files_only=False, + resume_download=True + ) + except Exception as e: + raise RuntimeError(f"Failed to download model {model_id}: {str(e)}") + + +def initialize_model_for_eval( + model: Union[str, LM], + model_args: Optional[str] = None, + batch_size: int = None, + max_batch_size: Optional[int] = None, + device: Optional[str] = None, + cache_dir: Optional[str] = None +) -> LM: + """ + Initialize model for distributed evaluation where each node runs independent evaluations. + + Args: + model (Union[str, LM]): + Either a string identifier for the model to load from registry, + or an already instantiated LM object. + model_args (Optional[str], optional): + Additional arguments for model initialization as a string. + Only used if model is provided as a string. Defaults to None. + batch_size (Optional[int], optional): + Batch size for model inference. Defaults to None. + max_batch_size (Optional[int], optional): + Maximum allowed batch size. Defaults to None. + device (Optional[str], optional): + Device to load the model on (e.g., 'cuda', 'cpu'). Defaults to None. + + Returns: + LM: + Initialized language model instance with configured parameters + and a sanitized model identifier. + """ + local_rank = int(os.getenv('LOCAL_RANK', '0')) + + if isinstance(model, str): + initializer = ModelInitializer(cache_dir) + + try: + initializer.download_model(model) + except Exception as e: + print(f"Rank {local_rank} failed to initialize model: {str(e)}") + if dist.is_initialized(): + dist.barrier() # Ensure all ranks fail together + raise e + + if dist.is_initialized(): + dist.barrier() + + if model_args is None: + model_args = "" + + config = { + "batch_size": batch_size, + "max_batch_size": max_batch_size, + "device": device, + } + + try: + lm = lm_eval.api.registry.get_model(model).create_from_arg_string( + model_args, + config, + ) + except Exception as e: + print(f"Rank {local_rank} failed to create model: {str(e)}") + if dist.is_initialized(): + dist.barrier() + raise e + else: + lm = model + + lm.model_identifier = sanitize_model_name(f"model_{model}_model_args_{model_args}") + return lm + + def setup_custom_parser(): """ Create a custom argument parser that extends lm-eval-harness parser. 
@@ -302,7 +401,8 @@ def cli_evaluate(args: Optional[argparse.Namespace] = None) -> None: ) # Add metadata to results - if lm.accelerator.process_index == 0: + is_main_process = lm.accelerator.process_index == 0 if hasattr(lm, 'accelerator') else lm.world_size <= 1 + if is_main_process: add_results_metadata(results, args, lm) handle_evaluation_output(results, args, evaluation_tracker, wandb_logger) From 70964fcea16777e9faaec53fdd2d60f6e11aefd0 Mon Sep 17 00:00:00 2001 From: Negin Raoof Date: Thu, 21 Nov 2024 21:01:47 -0600 Subject: [PATCH 3/3] fix repobench --- eval/chat_benchmarks/HumanEval/eval_instruct.py | 2 +- eval/chat_benchmarks/RepoBench/eval_instruct.py | 5 +++-- .../weighted_alpaca_eval_gpt4_turbo_leaderboard.csv | 2 +- .../length_controlled_v1/baseline_gpt4_1106_preview.csv | 2 -- eval/eval.py | 2 +- 5 files changed, 6 insertions(+), 7 deletions(-) diff --git a/eval/chat_benchmarks/HumanEval/eval_instruct.py b/eval/chat_benchmarks/HumanEval/eval_instruct.py index 85d5f066..7ddc3470 100644 --- a/eval/chat_benchmarks/HumanEval/eval_instruct.py +++ b/eval/chat_benchmarks/HumanEval/eval_instruct.py @@ -112,7 +112,7 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: self.logger.info("Generating responses for Human Eval...") outputs = self.compute(model, all_instances) - is_main_process = lm.accelerator.process_index == 0 if hasattr(lm, 'accelerator') else lm.world_size <= 1 + is_main_process = model.accelerator.process_index == 0 if hasattr(model, 'accelerator') else model.world_size <= 1 if not is_main_process: continue diff --git a/eval/chat_benchmarks/RepoBench/eval_instruct.py b/eval/chat_benchmarks/RepoBench/eval_instruct.py index ac89a691..f6760ee6 100644 --- a/eval/chat_benchmarks/RepoBench/eval_instruct.py +++ b/eval/chat_benchmarks/RepoBench/eval_instruct.py @@ -137,6 +137,7 @@ def _generate_responses_legacy(self, model: LM) -> Dict[str, Any]: temp_dir_obj = tempfile.TemporaryDirectory() temp_dir = temp_dir_obj.name + is_main_process = model.accelerator.process_index == 0 if hasattr(model, 'accelerator') else model.world_size <= 1 for lang in self.languages: for subset in self.subsets: dataset = load_data(split="test", task="completion", language=lang, length="2k", setting=subset) @@ -163,8 +164,8 @@ def _generate_responses_legacy(self, model: LM) -> Dict[str, Any]: ) outputs = self.compute(model, all_instances, do_slice=False) - - if model.accelerator.process_index != 0: + + if not is_main_process: continue generated_examples = [] diff --git a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/leaderboards/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/leaderboards/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv index 185f33c5..25a5c698 100644 --- a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/leaderboards/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv +++ b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/leaderboards/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv @@ -208,4 +208,4 @@ oasst-sft-pythia-12b,1.790114083180124,0.3985580883049341,13,790,2,805,1.7391304 guanaco-13b,3.469596859739131,0.5518606725700214,22,780,3,805,2.919254658385093,verified,1774,3.003787329611614, guanaco-7b,2.880002266173913,0.5202924149314048,21,783,1,805,2.670807453416149,verified,1364,2.871116813131697, Qwen1.5-1.8B-Chat,3.70555681579365,0.5811750995496215,27,774,3,804,3.544776119402985,verified,2673,2.588498849185137, 
-baichuan-13b-chat,1.9921455615279504,0.4176985079331233,14,790,1,805,1.8012422360248446,community,1727,2.062170253598568, \ No newline at end of file +baichuan-13b-chat,1.9921455615279504,0.4176985079331233,14,790,1,805,1.8012422360248446,community,1727,2.062170253598568, diff --git a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt4_turbo/length_controlled_v1/baseline_gpt4_1106_preview.csv b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt4_turbo/length_controlled_v1/baseline_gpt4_1106_preview.csv index 621ffd75..ad2e31af 100644 --- a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt4_turbo/length_controlled_v1/baseline_gpt4_1106_preview.csv +++ b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt4_turbo/length_controlled_v1/baseline_gpt4_1106_preview.csv @@ -186,5 +186,3 @@ Mistral-7B-Instruct-v0.3,-1.5007159011881868,0.9845683091847074,-1.7652759895328 Shopee-SlimMoA-v1,-0.6930943742294789,0.5778443790027642,1.4506276222723822 blendaxai-gm-l6-vo31,-1.4827230167114802,0.8256378421072179,1.5942312525409852 REBEL-Llama-3-8B-Instruct-Armo,-1.0427168605260002,0.6464073051877255,0.0395191056877229 -model_hf_model_args_pretrained=mlfoundations-dev__gemma-simpo-reproduction,-1.1818376919023723,0.6835318362039150,1.1479555832649320 -model_hf_model_args_pretrained=mlfoundations-dev__gemma-oh-preferences,-1.8345282763259563,0.7434213717748921,-9.8937244442602008 \ No newline at end of file diff --git a/eval/eval.py b/eval/eval.py index c806be5e..6f5fad43 100644 --- a/eval/eval.py +++ b/eval/eval.py @@ -247,7 +247,7 @@ def evaluate( cpu_count = os.cpu_count() max_workers = min(len(valid_tasks), cpu_count * 2) - if lm.world_size <= 1 or lm.accelerator.process_index == 0: + if (hasattr(lm, 'accelerator') and lm.accelerator.process_index == 0) or lm.world_size <= 1: with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: evaluate_results = list( executor.map(