From 2d96e47f059e3b7d92f1573359a57adfa4b787b0 Mon Sep 17 00:00:00 2001 From: Jean Mercat Date: Tue, 19 Nov 2024 10:09:04 -0800 Subject: [PATCH 1/3] change slicing in compute, change rank to accelerator.process_index --- eval/chat_benchmarks/HumanEval/eval_instruct.py | 2 +- eval/chat_benchmarks/IFEval/eval_instruct.py | 2 +- eval/chat_benchmarks/MBPP/eval_instruct.py | 2 +- eval/chat_benchmarks/MTBench/eval_instruct.py | 2 +- .../fastchat/modules/xfastertransformer.py | 2 +- eval/chat_benchmarks/MixEval/eval_instruct.py | 8 ++++---- eval/chat_benchmarks/RepoBench/eval_instruct.py | 15 +++++++++------ eval/chat_benchmarks/WildBench/eval_instruct.py | 2 +- .../chat_benchmarks/alpaca_eval/eval_instruct.py | 2 +- ...ighted_alpaca_eval_gpt4_turbo_leaderboard.csv | 8 +++++--- .../baseline_gpt4_1106_preview.csv | 2 ++ eval/chat_benchmarks/zeroeval/eval_instruct.py | 2 +- eval/eval.py | 4 ++-- eval/task.py | 16 ++++++++-------- 14 files changed, 38 insertions(+), 31 deletions(-) diff --git a/eval/chat_benchmarks/HumanEval/eval_instruct.py b/eval/chat_benchmarks/HumanEval/eval_instruct.py index ad705099..17149ef6 100644 --- a/eval/chat_benchmarks/HumanEval/eval_instruct.py +++ b/eval/chat_benchmarks/HumanEval/eval_instruct.py @@ -112,7 +112,7 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: self.logger.info("Generating responses for Human Eval...") outputs = self.compute(model, all_instances) - if model.rank != 0: + if model.accelerator.process_index != 0: continue generated_examples = [] diff --git a/eval/chat_benchmarks/IFEval/eval_instruct.py b/eval/chat_benchmarks/IFEval/eval_instruct.py index 7e500dbe..6f5f0b7d 100644 --- a/eval/chat_benchmarks/IFEval/eval_instruct.py +++ b/eval/chat_benchmarks/IFEval/eval_instruct.py @@ -115,7 +115,7 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: self.logger.info("Generating responses...") outputs = self.compute(model, all_instances) - if model.rank != 0: + if model.accelerator.process_index != 0: return None generated_examples = [] diff --git a/eval/chat_benchmarks/MBPP/eval_instruct.py b/eval/chat_benchmarks/MBPP/eval_instruct.py index 875c8bac..eff2af9d 100644 --- a/eval/chat_benchmarks/MBPP/eval_instruct.py +++ b/eval/chat_benchmarks/MBPP/eval_instruct.py @@ -161,7 +161,7 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: outputs = self.compute(model, all_instances) # Return None early for non-primary ranks - if model.rank != 0: + if model.accelerator.process_index != 0: return None generated_examples = [] diff --git a/eval/chat_benchmarks/MTBench/eval_instruct.py b/eval/chat_benchmarks/MTBench/eval_instruct.py index 202662ac..b35a58c0 100644 --- a/eval/chat_benchmarks/MTBench/eval_instruct.py +++ b/eval/chat_benchmarks/MTBench/eval_instruct.py @@ -151,7 +151,7 @@ def get_model_answers(self, model: LM, model_id: str, questions: List[Dict[str, all_convs[q_idx].append({"role": "assistant", "content": output}) all_choices[q_idx]["turns"].append(output) - if model.rank != 0: + if model.accelerator.process_index != 0: continue # Save completed conversations diff --git a/eval/chat_benchmarks/MTBench/fastchat/modules/xfastertransformer.py b/eval/chat_benchmarks/MTBench/fastchat/modules/xfastertransformer.py index 8c95d4d7..557ec4c4 100644 --- a/eval/chat_benchmarks/MTBench/fastchat/modules/xfastertransformer.py +++ b/eval/chat_benchmarks/MTBench/fastchat/modules/xfastertransformer.py @@ -36,7 +36,7 @@ def load_xft_model(model_path, xft_config: XftConfig): tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, 
padding_side="left", trust_remote_code=True) xft_model = xfastertransformer.AutoModel.from_pretrained(model_path, dtype=data_type) model = XftModel(xft_model=xft_model, xft_config=xft_config) - if model.model.rank > 0: + if model.model.accelerator.process_index > 0: while True: model.model.generate() return model, tokenizer diff --git a/eval/chat_benchmarks/MixEval/eval_instruct.py b/eval/chat_benchmarks/MixEval/eval_instruct.py index d92c7e8e..ff56f47b 100644 --- a/eval/chat_benchmarks/MixEval/eval_instruct.py +++ b/eval/chat_benchmarks/MixEval/eval_instruct.py @@ -135,7 +135,7 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: for split in splits: self.args.split = split all_results = self._eval_split(model, split) - if model.rank == 0: + if model.accelerator.process_index == 0: response_file = self._get_response_file() with open(response_file, "w") as f: for result in all_results: @@ -143,7 +143,7 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: out_dict[split] = all_results # Only return results on rank 0 - if model.world_size > 1 and model.rank != 0: + if model.world_size > 1 and model.accelerator.process_index != 0: return None return out_dict @@ -192,7 +192,7 @@ def _eval_split(self, model: LM, split: str) -> List[Dict[str, Any]]: for idx in list(range(len(eval_dataset.raw_inputs))): eval_dataset.raw_inputs[idx]["response"] = all_responses[idx] - if model.rank == 0: + if model.accelerator.process_index == 0: with open(response_file, "w") as f: for item in eval_dataset.raw_inputs: json_line = json.dumps(item) @@ -243,7 +243,7 @@ def run_benchmark(self, model: LM) -> Dict[str, Any]: generation_results = self.generate_responses(model) # Only evaluate on rank 0 - if model.world_size > 1 and model.rank != 0: + if model.world_size > 1 and model.accelerator.process_index != 0: return None evaluation_results = self.evaluate_responses(generation_results) diff --git a/eval/chat_benchmarks/RepoBench/eval_instruct.py b/eval/chat_benchmarks/RepoBench/eval_instruct.py index cb207c10..cc2251ce 100644 --- a/eval/chat_benchmarks/RepoBench/eval_instruct.py +++ b/eval/chat_benchmarks/RepoBench/eval_instruct.py @@ -59,7 +59,7 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: if self.legacy_mode: return self._generate_responses_legacy(model) - if model.rank == 0: + if model.accelerator.process_index == 0: temp_dir_obj = tempfile.TemporaryDirectory() temp_dir = temp_dir_obj.name @@ -76,8 +76,11 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: all_instances = [] # Split dataset across ranks for parallel construction - # Get subset of dataset for this rank using built-in slice functionality - rank_dataset = list(islice(dataset, model.rank, len(dataset), model.world_size)) + # Get subset of dataset for this rank using the same slicing strategy as the compute function + chunk_size = len(dataset) // model.world_size + start = model.accelerator.process_index * chunk_size + end = start + chunk_size if model.accelerator.process_index < model.world_size - 1 else len(dataset) + rank_dataset = dataset.select(range(start, end)) # Process examples for this rank's shard for idx, example in enumerate(rank_dataset): @@ -100,7 +103,7 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: outputs = self.compute(model, all_instances, do_slice=False) # Only rank 0 should save the results - if model.rank != 0: + if model.accelerator.process_indexlerator.process_index != 0: continue generated_examples = [] @@ -118,7 +121,7 @@ def generate_responses(self, model: LM) -> 
Dict[str, Any]: for ex in generated_examples: fw.write(json.dumps(ex) + "\n") - if model.rank == 0: + if model.accelerator.process_index == 0: return {"temp_dir_obj": temp_dir_obj} def _generate_responses_legacy(self, model: LM) -> Dict[str, Any]: @@ -156,7 +159,7 @@ def _generate_responses_legacy(self, model: LM) -> Dict[str, Any]: outputs = self.compute(model, all_instances, do_slice=False) - if model.rank != 0: + if model.accelerator.process_index != 0: continue generated_examples = [] diff --git a/eval/chat_benchmarks/WildBench/eval_instruct.py b/eval/chat_benchmarks/WildBench/eval_instruct.py index 100e6d68..f909b206 100644 --- a/eval/chat_benchmarks/WildBench/eval_instruct.py +++ b/eval/chat_benchmarks/WildBench/eval_instruct.py @@ -196,7 +196,7 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: outputs = self.compute(model, all_instances) # Return None early for non-primary ranks - if model.rank != 0: + if model.accelerator.process_index != 0: return None outputs = [[output] for output in outputs] diff --git a/eval/chat_benchmarks/alpaca_eval/eval_instruct.py b/eval/chat_benchmarks/alpaca_eval/eval_instruct.py index 0a79b1e8..f1d6170b 100644 --- a/eval/chat_benchmarks/alpaca_eval/eval_instruct.py +++ b/eval/chat_benchmarks/alpaca_eval/eval_instruct.py @@ -117,7 +117,7 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: self.logger.info("Generating responses for Alpaca Eval...") outputs = self.compute(model, all_instances) - if model.rank != 0: + if model.accelerator.process_index != 0: return None model_outputs = [] diff --git a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/leaderboards/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/leaderboards/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv index 25a5c698..01881528 100644 --- a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/leaderboards/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv +++ b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/leaderboards/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv @@ -1,9 +1,10 @@ ,win_rate,standard_error,n_wins,n_wins_base,n_draws,n_total,discrete_win_rate,mode,avg_length,length_controlled_winrate,lc_standard_error -Shopee-SlimMoA-v1,75.61428659805350,1.2706274059194700,621,184,0,805,77.14285714285720,community,1994,77.4515432873834,0.43017522149239600 -blendaxai-gm-l6-vo31,69.11033492869565,1.3280735654354863,562,242,1,805,69.87577639751554,community,1809,76.91981221023656,0.5725365663132986 +Shopee-SlimMoA-v1,75.6142865980535,1.27062740591947,621,184,0,805,77.1428571428572,community,1994,77.4515432873834,0.430175221492396 +blendaxai-gm-l6-vo31,69.11033492869565,1.3280735654354865,562,242,1,805,69.87577639751554,community,1809,76.91981221023656,0.5725365663132986 gemma-2-9b-it-WPO-HB,77.82503168985093,1.2355857177790277,640,163,2,805,79.62732919254658,community,2285,76.72506842726064,0.4242603928637889 blendaxai-gm-l3-v35,73.41035740244067,1.254951147343878,607,196,2,805,75.527950310559,community,2186,73.37270365010379,0.6163911450738288 gemma-2-9b-it-SimPO,65.86422561532919,1.423459922555078,540,264,1,805,67.14285714285714,community,1833,72.3508446939842,0.5167873784867067 +model_hf_model_args_pretrained=mlfoundations-dev__gemma-simpo-reproduction,67.35102937013792,1.4210070002869848,557,247,1,805,69.25465838509317,community,1950,71.18995900084634,0.5756949353655318 
openpipe-moa-gpt-4-turbo-v1,63.15493451236265,1.422980098799326,515,283,7,805,64.40993788819875,community,1856,68.37866250336802,0.7309418614587613 gemma-2-9b-it-DPO,65.35922380122982,1.402802336467638,536,268,1,805,66.64596273291924,community,2016,67.6620382198043,0.6605613085864308 Together-MoA,59.8688062333292,1.434305604543079,490,314,1,805,60.93167701863354,community,1825,65.37996976852163,0.7392392836781445 @@ -22,7 +23,7 @@ gpt4_1106_preview_verbose,64.30360147101865,1.3348590089025316,525,268,12,805,65 gpt-4o-mini-2024-07-18,44.65413862507926,1.4572395578449813,350,451,4,805,43.72670807453416,minimal,1861,50.727144855901976,0.8284734951761676 Storm-7B,50.26886905528583,1.4728176780737183,397,408,0,805,49.31677018633541,community,2045,50.45110959343775, gpt4_1106_preview,50.0,0.0,0,0,805,805,50.0,minimal,2049,50.0, -REBEL-Llama-3-8B-Instruct-Armo,48.43655307668638,1.480341435123528,394,410,1,805,49.006211180124225,community,1965,49.314293536857114,0.7061879308002301 +REBEL-Llama-3-8B-Instruct-Armo,48.43655307668638,1.480341435123528,394,410,1,805,49.00621118012423,community,1965,49.31429353685712,0.7061879308002301 Infinity-Instruct-7M-Gen-Llama3_1-70B,37.46327383827497,1.4734130373862548,299,501,5,805,37.453416149068325,community,1654,46.10043331712677,0.822439983375277 Llama-3-Instruct-8B-SimPO-ExPO,40.63285400856655,1.4439449942168028,325,479,1,805,40.43478260869565,community,1765,45.78021783946177, Llama-3-Instruct-8B-SimPO,40.52977498461182,1.422574464675002,319,485,1,805,39.68944099378882,community,1825,44.65131348921881,0.8800655791760451 @@ -209,3 +210,4 @@ guanaco-13b,3.469596859739131,0.5518606725700214,22,780,3,805,2.919254658385093, guanaco-7b,2.880002266173913,0.5202924149314048,21,783,1,805,2.670807453416149,verified,1364,2.871116813131697, Qwen1.5-1.8B-Chat,3.70555681579365,0.5811750995496215,27,774,3,804,3.544776119402985,verified,2673,2.588498849185137, baichuan-13b-chat,1.9921455615279504,0.4176985079331233,14,790,1,805,1.8012422360248446,community,1727,2.062170253598568, +model_hf_model_args_pretrained=mlfoundations-dev__gemma-oh-preferences,0.005260368511326853,0.0018774672393365112,0,805,0,805,0.0,community,196,0.010252829751292214,0.0007495965900756891 diff --git a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt4_turbo/length_controlled_v1/baseline_gpt4_1106_preview.csv b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt4_turbo/length_controlled_v1/baseline_gpt4_1106_preview.csv index ad2e31af..7d6fa028 100644 --- a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt4_turbo/length_controlled_v1/baseline_gpt4_1106_preview.csv +++ b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt4_turbo/length_controlled_v1/baseline_gpt4_1106_preview.csv @@ -186,3 +186,5 @@ Mistral-7B-Instruct-v0.3,-1.5007159011881868,0.9845683091847074,-1.7652759895328 Shopee-SlimMoA-v1,-0.6930943742294789,0.5778443790027642,1.4506276222723822 blendaxai-gm-l6-vo31,-1.4827230167114802,0.8256378421072179,1.5942312525409852 REBEL-Llama-3-8B-Instruct-Armo,-1.0427168605260002,0.6464073051877255,0.0395191056877229 +model_hf_model_args_pretrained=mlfoundations-dev__gemma-simpo-reproduction,-1.1818376919023723,0.6835318362039150,1.1479555832649320 +model_hf_model_args_pretrained=mlfoundations-dev__gemma-oh-preferences,-1.8345282763259563,0.7434213717748921,-9.8937244442602008 diff --git a/eval/chat_benchmarks/zeroeval/eval_instruct.py 
b/eval/chat_benchmarks/zeroeval/eval_instruct.py index c68a4379..e5e73c3e 100644 --- a/eval/chat_benchmarks/zeroeval/eval_instruct.py +++ b/eval/chat_benchmarks/zeroeval/eval_instruct.py @@ -144,7 +144,7 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: outputs = self.compute(model, all_instances) - if model.rank != 0: + if model.accelerator.process_index != 0: continue outputs = [[output] for output in outputs] diff --git a/eval/eval.py b/eval/eval.py index 5bd177cf..2c0d5682 100644 --- a/eval/eval.py +++ b/eval/eval.py @@ -148,7 +148,7 @@ def evaluate( cpu_count = os.cpu_count() max_workers = min(len(valid_tasks), cpu_count * 2) - if lm.world_size <= 1 or lm.rank == 0: + if lm.world_size <= 1 or lm.accelerator.process_index == 0: with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: evaluate_results = list( executor.map( @@ -302,7 +302,7 @@ def cli_evaluate(args: Optional[argparse.Namespace] = None) -> None: ) # Add metadata to results - if lm.rank == 0: + if lm.accelerator.process_index == 0: add_results_metadata(results, args, lm) handle_evaluation_output(results, args, evaluation_tracker, wandb_logger) diff --git a/eval/task.py b/eval/task.py index aec96eca..337dd6d4 100644 --- a/eval/task.py +++ b/eval/task.py @@ -20,23 +20,23 @@ def __init__(self, logger: Optional[logging.Logger] = None): def compute(self, model: LM, inputs: List[Instance], do_slice: bool = True) -> List[str]: if model.world_size > 1 and do_slice: - prompts = list(islice(inputs, model.rank, len(inputs), model.world_size)) + chunk_size = len(inputs) // model.world_size + start = model.accelerator.process_index * chunk_size + end = start + chunk_size if model.accelerator.process_index < model.world_size - 1 else len(inputs) + prompts = inputs[start:end] else: prompts = inputs results = model.generate_until(prompts) if model.world_size > 1: all_results = [None for _ in range(model.world_size)] - dist.all_gather_object(all_results, results) - # Merge results from all ranks - length = sum(len(res) for res in all_results if res is not None) - merged = [None] * length - for rank, sub_results in enumerate(all_results): + # Simply concatenate results in rank order + merged = [] + for sub_results in all_results: if sub_results is not None: - for i, item in enumerate(sub_results): - merged[i * model.world_size + rank] = item + merged.extend(sub_results) return merged else: return results From aa7b45189bb411eb305ce3ae828fc1e8066b9838 Mon Sep 17 00:00:00 2001 From: Negin Raoof Date: Thu, 21 Nov 2024 20:53:54 -0600 Subject: [PATCH 2/3] fix for single process --- .../HumanEval/eval_instruct.py | 3 +- eval/chat_benchmarks/IFEval/eval_instruct.py | 3 +- eval/chat_benchmarks/MBPP/eval_instruct.py | 3 +- eval/chat_benchmarks/MTBench/eval_instruct.py | 3 +- eval/chat_benchmarks/MixEval/eval_instruct.py | 12 ++- .../RepoBench/eval_instruct.py | 19 ++-- .../WildBench/eval_instruct.py | 3 +- .../alpaca_eval/eval_instruct.py | 3 +- ...ted_alpaca_eval_gpt4_turbo_leaderboard.csv | 10 +- .../baseline_gpt4_1106_preview.csv | 2 +- .../chat_benchmarks/zeroeval/eval_instruct.py | 3 +- eval/eval.py | 102 +++++++++++++++++- 12 files changed, 140 insertions(+), 26 deletions(-) diff --git a/eval/chat_benchmarks/HumanEval/eval_instruct.py b/eval/chat_benchmarks/HumanEval/eval_instruct.py index 17149ef6..85d5f066 100644 --- a/eval/chat_benchmarks/HumanEval/eval_instruct.py +++ b/eval/chat_benchmarks/HumanEval/eval_instruct.py @@ -112,7 +112,8 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: 
self.logger.info("Generating responses for Human Eval...") outputs = self.compute(model, all_instances) - if model.accelerator.process_index != 0: + is_main_process = lm.accelerator.process_index == 0 if hasattr(lm, 'accelerator') else lm.world_size <= 1 + if not is_main_process: continue generated_examples = [] diff --git a/eval/chat_benchmarks/IFEval/eval_instruct.py b/eval/chat_benchmarks/IFEval/eval_instruct.py index 6f5f0b7d..5c7b1d8e 100644 --- a/eval/chat_benchmarks/IFEval/eval_instruct.py +++ b/eval/chat_benchmarks/IFEval/eval_instruct.py @@ -115,7 +115,8 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: self.logger.info("Generating responses...") outputs = self.compute(model, all_instances) - if model.accelerator.process_index != 0: + is_main_process = model.accelerator.process_index == 0 if hasattr(model, 'accelerator') else model.world_size <= 1 + if not is_main_process: return None generated_examples = [] diff --git a/eval/chat_benchmarks/MBPP/eval_instruct.py b/eval/chat_benchmarks/MBPP/eval_instruct.py index eff2af9d..6d589bf6 100644 --- a/eval/chat_benchmarks/MBPP/eval_instruct.py +++ b/eval/chat_benchmarks/MBPP/eval_instruct.py @@ -161,7 +161,8 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: outputs = self.compute(model, all_instances) # Return None early for non-primary ranks - if model.accelerator.process_index != 0: + is_main_process = model.accelerator.process_index == 0 if hasattr(model, 'accelerator') else model.world_size <= 1 + if not is_main_process: return None generated_examples = [] diff --git a/eval/chat_benchmarks/MTBench/eval_instruct.py b/eval/chat_benchmarks/MTBench/eval_instruct.py index b35a58c0..7a61f238 100644 --- a/eval/chat_benchmarks/MTBench/eval_instruct.py +++ b/eval/chat_benchmarks/MTBench/eval_instruct.py @@ -151,7 +151,8 @@ def get_model_answers(self, model: LM, model_id: str, questions: List[Dict[str, all_convs[q_idx].append({"role": "assistant", "content": output}) all_choices[q_idx]["turns"].append(output) - if model.accelerator.process_index != 0: + is_main_process = model.accelerator.process_index == 0 if hasattr(model, 'accelerator') else model.world_size <= 1 + if not is_main_process: continue # Save completed conversations diff --git a/eval/chat_benchmarks/MixEval/eval_instruct.py b/eval/chat_benchmarks/MixEval/eval_instruct.py index ff56f47b..f15f2b52 100644 --- a/eval/chat_benchmarks/MixEval/eval_instruct.py +++ b/eval/chat_benchmarks/MixEval/eval_instruct.py @@ -132,10 +132,12 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: out_dict = {} self.logger.info("Generating responses for MixEval...") + is_main_process = model.accelerator.process_index == 0 if hasattr(model, 'accelerator') else model.world_size <= 1 + for split in splits: self.args.split = split all_results = self._eval_split(model, split) - if model.accelerator.process_index == 0: + if is_main_process: response_file = self._get_response_file() with open(response_file, "w") as f: for result in all_results: @@ -143,7 +145,7 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: out_dict[split] = all_results # Only return results on rank 0 - if model.world_size > 1 and model.accelerator.process_index != 0: + if not is_main_process: return None return out_dict @@ -192,7 +194,8 @@ def _eval_split(self, model: LM, split: str) -> List[Dict[str, Any]]: for idx in list(range(len(eval_dataset.raw_inputs))): eval_dataset.raw_inputs[idx]["response"] = all_responses[idx] - if model.accelerator.process_index == 0: + is_main_process = 
model.accelerator.process_index == 0 if hasattr(model, 'accelerator') else model.world_size <= 1 + if is_main_process: with open(response_file, "w") as f: for item in eval_dataset.raw_inputs: json_line = json.dumps(item) @@ -243,7 +246,8 @@ def run_benchmark(self, model: LM) -> Dict[str, Any]: generation_results = self.generate_responses(model) # Only evaluate on rank 0 - if model.world_size > 1 and model.accelerator.process_index != 0: + is_main_process = model.accelerator.process_index == 0 if hasattr(model, 'accelerator') else model.world_size <= 1 + if not is_main_process: return None evaluation_results = self.evaluate_responses(generation_results) diff --git a/eval/chat_benchmarks/RepoBench/eval_instruct.py b/eval/chat_benchmarks/RepoBench/eval_instruct.py index cc2251ce..ac89a691 100644 --- a/eval/chat_benchmarks/RepoBench/eval_instruct.py +++ b/eval/chat_benchmarks/RepoBench/eval_instruct.py @@ -59,7 +59,8 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: if self.legacy_mode: return self._generate_responses_legacy(model) - if model.accelerator.process_index == 0: + is_main_process = model.accelerator.process_index == 0 if hasattr(model, 'accelerator') else model.world_size <= 1 + if is_main_process: temp_dir_obj = tempfile.TemporaryDirectory() temp_dir = temp_dir_obj.name @@ -77,10 +78,13 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: all_instances = [] # Split dataset across ranks for parallel construction # Get subset of dataset for this rank using the same slicing strategy as the compute function - chunk_size = len(dataset) // model.world_size - start = model.accelerator.process_index * chunk_size - end = start + chunk_size if model.accelerator.process_index < model.world_size - 1 else len(dataset) - rank_dataset = dataset.select(range(start, end)) + if hasattr(model, 'accelerator'): + chunk_size = len(dataset) // model.world_size + start = model.accelerator.process_index * chunk_size + end = start + chunk_size if model.accelerator.process_index < model.world_size - 1 else len(dataset) + rank_dataset = dataset.select(range(start, end)) + else: + rank_dataset = list(islice(dataset, model.rank, len(dataset), model.world_size)) # Process examples for this rank's shard for idx, example in enumerate(rank_dataset): @@ -103,7 +107,8 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: outputs = self.compute(model, all_instances, do_slice=False) # Only rank 0 should save the results - if model.accelerator.process_indexlerator.process_index != 0: + is_main_process = model.accelerator.process_index == 0 if hasattr(model, 'accelerator') else model.world_size <= 1 + if not is_main_process: continue generated_examples = [] @@ -121,7 +126,7 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: for ex in generated_examples: fw.write(json.dumps(ex) + "\n") - if model.accelerator.process_index == 0: + if is_main_process: return {"temp_dir_obj": temp_dir_obj} def _generate_responses_legacy(self, model: LM) -> Dict[str, Any]: diff --git a/eval/chat_benchmarks/WildBench/eval_instruct.py b/eval/chat_benchmarks/WildBench/eval_instruct.py index f909b206..2ff69cea 100644 --- a/eval/chat_benchmarks/WildBench/eval_instruct.py +++ b/eval/chat_benchmarks/WildBench/eval_instruct.py @@ -196,7 +196,8 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: outputs = self.compute(model, all_instances) # Return None early for non-primary ranks - if model.accelerator.process_index != 0: + is_main_process = model.accelerator.process_index == 0 if hasattr(model, 
'accelerator') else model.world_size <= 1 + if not is_main_process: return None outputs = [[output] for output in outputs] diff --git a/eval/chat_benchmarks/alpaca_eval/eval_instruct.py b/eval/chat_benchmarks/alpaca_eval/eval_instruct.py index f1d6170b..09f86da4 100644 --- a/eval/chat_benchmarks/alpaca_eval/eval_instruct.py +++ b/eval/chat_benchmarks/alpaca_eval/eval_instruct.py @@ -117,7 +117,8 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: self.logger.info("Generating responses for Alpaca Eval...") outputs = self.compute(model, all_instances) - if model.accelerator.process_index != 0: + is_main_process = model.accelerator.process_index == 0 if hasattr(model, 'accelerator') else model.world_size <= 1 + if not is_main_process: return None model_outputs = [] diff --git a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/leaderboards/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/leaderboards/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv index 01881528..185f33c5 100644 --- a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/leaderboards/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv +++ b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/leaderboards/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv @@ -1,10 +1,9 @@ ,win_rate,standard_error,n_wins,n_wins_base,n_draws,n_total,discrete_win_rate,mode,avg_length,length_controlled_winrate,lc_standard_error -Shopee-SlimMoA-v1,75.6142865980535,1.27062740591947,621,184,0,805,77.1428571428572,community,1994,77.4515432873834,0.430175221492396 -blendaxai-gm-l6-vo31,69.11033492869565,1.3280735654354865,562,242,1,805,69.87577639751554,community,1809,76.91981221023656,0.5725365663132986 +Shopee-SlimMoA-v1,75.61428659805350,1.2706274059194700,621,184,0,805,77.14285714285720,community,1994,77.4515432873834,0.43017522149239600 +blendaxai-gm-l6-vo31,69.11033492869565,1.3280735654354863,562,242,1,805,69.87577639751554,community,1809,76.91981221023656,0.5725365663132986 gemma-2-9b-it-WPO-HB,77.82503168985093,1.2355857177790277,640,163,2,805,79.62732919254658,community,2285,76.72506842726064,0.4242603928637889 blendaxai-gm-l3-v35,73.41035740244067,1.254951147343878,607,196,2,805,75.527950310559,community,2186,73.37270365010379,0.6163911450738288 gemma-2-9b-it-SimPO,65.86422561532919,1.423459922555078,540,264,1,805,67.14285714285714,community,1833,72.3508446939842,0.5167873784867067 -model_hf_model_args_pretrained=mlfoundations-dev__gemma-simpo-reproduction,67.35102937013792,1.4210070002869848,557,247,1,805,69.25465838509317,community,1950,71.18995900084634,0.5756949353655318 openpipe-moa-gpt-4-turbo-v1,63.15493451236265,1.422980098799326,515,283,7,805,64.40993788819875,community,1856,68.37866250336802,0.7309418614587613 gemma-2-9b-it-DPO,65.35922380122982,1.402802336467638,536,268,1,805,66.64596273291924,community,2016,67.6620382198043,0.6605613085864308 Together-MoA,59.8688062333292,1.434305604543079,490,314,1,805,60.93167701863354,community,1825,65.37996976852163,0.7392392836781445 @@ -23,7 +22,7 @@ gpt4_1106_preview_verbose,64.30360147101865,1.3348590089025316,525,268,12,805,65 gpt-4o-mini-2024-07-18,44.65413862507926,1.4572395578449813,350,451,4,805,43.72670807453416,minimal,1861,50.727144855901976,0.8284734951761676 Storm-7B,50.26886905528583,1.4728176780737183,397,408,0,805,49.31677018633541,community,2045,50.45110959343775, gpt4_1106_preview,50.0,0.0,0,0,805,805,50.0,minimal,2049,50.0, 
-REBEL-Llama-3-8B-Instruct-Armo,48.43655307668638,1.480341435123528,394,410,1,805,49.00621118012423,community,1965,49.31429353685712,0.7061879308002301 +REBEL-Llama-3-8B-Instruct-Armo,48.43655307668638,1.480341435123528,394,410,1,805,49.006211180124225,community,1965,49.314293536857114,0.7061879308002301 Infinity-Instruct-7M-Gen-Llama3_1-70B,37.46327383827497,1.4734130373862548,299,501,5,805,37.453416149068325,community,1654,46.10043331712677,0.822439983375277 Llama-3-Instruct-8B-SimPO-ExPO,40.63285400856655,1.4439449942168028,325,479,1,805,40.43478260869565,community,1765,45.78021783946177, Llama-3-Instruct-8B-SimPO,40.52977498461182,1.422574464675002,319,485,1,805,39.68944099378882,community,1825,44.65131348921881,0.8800655791760451 @@ -209,5 +208,4 @@ oasst-sft-pythia-12b,1.790114083180124,0.3985580883049341,13,790,2,805,1.7391304 guanaco-13b,3.469596859739131,0.5518606725700214,22,780,3,805,2.919254658385093,verified,1774,3.003787329611614, guanaco-7b,2.880002266173913,0.5202924149314048,21,783,1,805,2.670807453416149,verified,1364,2.871116813131697, Qwen1.5-1.8B-Chat,3.70555681579365,0.5811750995496215,27,774,3,804,3.544776119402985,verified,2673,2.588498849185137, -baichuan-13b-chat,1.9921455615279504,0.4176985079331233,14,790,1,805,1.8012422360248446,community,1727,2.062170253598568, -model_hf_model_args_pretrained=mlfoundations-dev__gemma-oh-preferences,0.005260368511326853,0.0018774672393365112,0,805,0,805,0.0,community,196,0.010252829751292214,0.0007495965900756891 +baichuan-13b-chat,1.9921455615279504,0.4176985079331233,14,790,1,805,1.8012422360248446,community,1727,2.062170253598568, \ No newline at end of file diff --git a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt4_turbo/length_controlled_v1/baseline_gpt4_1106_preview.csv b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt4_turbo/length_controlled_v1/baseline_gpt4_1106_preview.csv index 7d6fa028..621ffd75 100644 --- a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt4_turbo/length_controlled_v1/baseline_gpt4_1106_preview.csv +++ b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt4_turbo/length_controlled_v1/baseline_gpt4_1106_preview.csv @@ -187,4 +187,4 @@ Shopee-SlimMoA-v1,-0.6930943742294789,0.5778443790027642,1.4506276222723822 blendaxai-gm-l6-vo31,-1.4827230167114802,0.8256378421072179,1.5942312525409852 REBEL-Llama-3-8B-Instruct-Armo,-1.0427168605260002,0.6464073051877255,0.0395191056877229 model_hf_model_args_pretrained=mlfoundations-dev__gemma-simpo-reproduction,-1.1818376919023723,0.6835318362039150,1.1479555832649320 -model_hf_model_args_pretrained=mlfoundations-dev__gemma-oh-preferences,-1.8345282763259563,0.7434213717748921,-9.8937244442602008 +model_hf_model_args_pretrained=mlfoundations-dev__gemma-oh-preferences,-1.8345282763259563,0.7434213717748921,-9.8937244442602008 \ No newline at end of file diff --git a/eval/chat_benchmarks/zeroeval/eval_instruct.py b/eval/chat_benchmarks/zeroeval/eval_instruct.py index e5e73c3e..2186b79e 100644 --- a/eval/chat_benchmarks/zeroeval/eval_instruct.py +++ b/eval/chat_benchmarks/zeroeval/eval_instruct.py @@ -144,7 +144,8 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: outputs = self.compute(model, all_instances) - if model.accelerator.process_index != 0: + is_main_process = model.accelerator.process_index == 0 if hasattr(model, 'accelerator') else model.world_size <= 1 + if not is_main_process: 
continue outputs = [[output] for output in outputs] diff --git a/eval/eval.py b/eval/eval.py index 2c0d5682..c806be5e 100644 --- a/eval/eval.py +++ b/eval/eval.py @@ -5,9 +5,11 @@ import sys import time from typing import Optional, List, Dict, Union +from pathlib import Path import concurrent.futures import torch.distributed as dist +from huggingface_hub import snapshot_download from lm_eval import utils from lm_eval import evaluator as pretrain_evaluator @@ -26,6 +28,103 @@ from eval.eval_tracker import DCEvaluationTracker +class ModelInitializer: + """Handles model initialization for distributed evaluations.""" + + def __init__(self, cache_dir: Optional[str] = None): + self.cache_dir = cache_dir or os.getenv('HF_HOME', + os.path.join(os.path.expanduser("~"), ".cache", "huggingface")) + self._ensure_directory(self.cache_dir) + + def _ensure_directory(self, path: str) -> None: + """Safely create directory if it doesn't exist.""" + Path(path).mkdir(parents=True, exist_ok=True) + + def download_model(self, model_id: str) -> None: + """Download model files with proper error handling.""" + try: + snapshot_download( + repo_id=model_id, + cache_dir=self.cache_dir, + local_files_only=False, + resume_download=True + ) + except Exception as e: + raise RuntimeError(f"Failed to download model {model_id}: {str(e)}") + + +def initialize_model_for_eval( + model: Union[str, LM], + model_args: Optional[str] = None, + batch_size: int = None, + max_batch_size: Optional[int] = None, + device: Optional[str] = None, + cache_dir: Optional[str] = None +) -> LM: + """ + Initialize model for distributed evaluation where each node runs independent evaluations. + + Args: + model (Union[str, LM]): + Either a string identifier for the model to load from registry, + or an already instantiated LM object. + model_args (Optional[str], optional): + Additional arguments for model initialization as a string. + Only used if model is provided as a string. Defaults to None. + batch_size (Optional[int], optional): + Batch size for model inference. Defaults to None. + max_batch_size (Optional[int], optional): + Maximum allowed batch size. Defaults to None. + device (Optional[str], optional): + Device to load the model on (e.g., 'cuda', 'cpu'). Defaults to None. + + Returns: + LM: + Initialized language model instance with configured parameters + and a sanitized model identifier. + """ + local_rank = int(os.getenv('LOCAL_RANK', '0')) + + if isinstance(model, str): + initializer = ModelInitializer(cache_dir) + + try: + initializer.download_model(model) + except Exception as e: + print(f"Rank {local_rank} failed to initialize model: {str(e)}") + if dist.is_initialized(): + dist.barrier() # Ensure all ranks fail together + raise e + + if dist.is_initialized(): + dist.barrier() + + if model_args is None: + model_args = "" + + config = { + "batch_size": batch_size, + "max_batch_size": max_batch_size, + "device": device, + } + + try: + lm = lm_eval.api.registry.get_model(model).create_from_arg_string( + model_args, + config, + ) + except Exception as e: + print(f"Rank {local_rank} failed to create model: {str(e)}") + if dist.is_initialized(): + dist.barrier() + raise e + else: + lm = model + + lm.model_identifier = sanitize_model_name(f"model_{model}_model_args_{model_args}") + return lm + + def setup_custom_parser(): """ Create a custom argument parser that extends lm-eval-harness parser. 
@@ -302,7 +401,8 @@ def cli_evaluate(args: Optional[argparse.Namespace] = None) -> None: ) # Add metadata to results - if lm.accelerator.process_index == 0: + is_main_process = lm.accelerator.process_index == 0 if hasattr(lm, 'accelerator') else lm.world_size <= 1 + if is_main_process: add_results_metadata(results, args, lm) handle_evaluation_output(results, args, evaluation_tracker, wandb_logger) From 70964fcea16777e9faaec53fdd2d60f6e11aefd0 Mon Sep 17 00:00:00 2001 From: Negin Raoof Date: Thu, 21 Nov 2024 21:01:47 -0600 Subject: [PATCH 3/3] fix repobench --- eval/chat_benchmarks/HumanEval/eval_instruct.py | 2 +- eval/chat_benchmarks/RepoBench/eval_instruct.py | 5 +++-- .../weighted_alpaca_eval_gpt4_turbo_leaderboard.csv | 2 +- .../length_controlled_v1/baseline_gpt4_1106_preview.csv | 2 -- eval/eval.py | 2 +- 5 files changed, 6 insertions(+), 7 deletions(-) diff --git a/eval/chat_benchmarks/HumanEval/eval_instruct.py b/eval/chat_benchmarks/HumanEval/eval_instruct.py index 85d5f066..7ddc3470 100644 --- a/eval/chat_benchmarks/HumanEval/eval_instruct.py +++ b/eval/chat_benchmarks/HumanEval/eval_instruct.py @@ -112,7 +112,7 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: self.logger.info("Generating responses for Human Eval...") outputs = self.compute(model, all_instances) - is_main_process = lm.accelerator.process_index == 0 if hasattr(lm, 'accelerator') else lm.world_size <= 1 + is_main_process = model.accelerator.process_index == 0 if hasattr(model, 'accelerator') else model.world_size <= 1 if not is_main_process: continue diff --git a/eval/chat_benchmarks/RepoBench/eval_instruct.py b/eval/chat_benchmarks/RepoBench/eval_instruct.py index ac89a691..f6760ee6 100644 --- a/eval/chat_benchmarks/RepoBench/eval_instruct.py +++ b/eval/chat_benchmarks/RepoBench/eval_instruct.py @@ -137,6 +137,7 @@ def _generate_responses_legacy(self, model: LM) -> Dict[str, Any]: temp_dir_obj = tempfile.TemporaryDirectory() temp_dir = temp_dir_obj.name + is_main_process = model.accelerator.process_index == 0 if hasattr(model, 'accelerator') else model.world_size <= 1 for lang in self.languages: for subset in self.subsets: dataset = load_data(split="test", task="completion", language=lang, length="2k", setting=subset) @@ -163,8 +164,8 @@ def _generate_responses_legacy(self, model: LM) -> Dict[str, Any]: ) outputs = self.compute(model, all_instances, do_slice=False) - - if model.accelerator.process_index != 0: + + if not is_main_process: continue generated_examples = [] diff --git a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/leaderboards/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/leaderboards/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv index 185f33c5..25a5c698 100644 --- a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/leaderboards/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv +++ b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/leaderboards/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv @@ -208,4 +208,4 @@ oasst-sft-pythia-12b,1.790114083180124,0.3985580883049341,13,790,2,805,1.7391304 guanaco-13b,3.469596859739131,0.5518606725700214,22,780,3,805,2.919254658385093,verified,1774,3.003787329611614, guanaco-7b,2.880002266173913,0.5202924149314048,21,783,1,805,2.670807453416149,verified,1364,2.871116813131697, Qwen1.5-1.8B-Chat,3.70555681579365,0.5811750995496215,27,774,3,804,3.544776119402985,verified,2673,2.588498849185137, 
-baichuan-13b-chat,1.9921455615279504,0.4176985079331233,14,790,1,805,1.8012422360248446,community,1727,2.062170253598568, \ No newline at end of file +baichuan-13b-chat,1.9921455615279504,0.4176985079331233,14,790,1,805,1.8012422360248446,community,1727,2.062170253598568, diff --git a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt4_turbo/length_controlled_v1/baseline_gpt4_1106_preview.csv b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt4_turbo/length_controlled_v1/baseline_gpt4_1106_preview.csv index 621ffd75..ad2e31af 100644 --- a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt4_turbo/length_controlled_v1/baseline_gpt4_1106_preview.csv +++ b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt4_turbo/length_controlled_v1/baseline_gpt4_1106_preview.csv @@ -186,5 +186,3 @@ Mistral-7B-Instruct-v0.3,-1.5007159011881868,0.9845683091847074,-1.7652759895328 Shopee-SlimMoA-v1,-0.6930943742294789,0.5778443790027642,1.4506276222723822 blendaxai-gm-l6-vo31,-1.4827230167114802,0.8256378421072179,1.5942312525409852 REBEL-Llama-3-8B-Instruct-Armo,-1.0427168605260002,0.6464073051877255,0.0395191056877229 -model_hf_model_args_pretrained=mlfoundations-dev__gemma-simpo-reproduction,-1.1818376919023723,0.6835318362039150,1.1479555832649320 -model_hf_model_args_pretrained=mlfoundations-dev__gemma-oh-preferences,-1.8345282763259563,0.7434213717748921,-9.8937244442602008 \ No newline at end of file diff --git a/eval/eval.py b/eval/eval.py index c806be5e..6f5fad43 100644 --- a/eval/eval.py +++ b/eval/eval.py @@ -247,7 +247,7 @@ def evaluate( cpu_count = os.cpu_count() max_workers = min(len(valid_tasks), cpu_count * 2) - if lm.world_size <= 1 or lm.accelerator.process_index == 0: + if (hasattr(lm, 'accelerator') and lm.accelerator.process_index == 0) or lm.world_size <= 1: with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: evaluate_results = list( executor.map(