Commit 77efde0: fix typos (huggingface#702)
1 parent 9bf210c
File tree: 12 files changed (+27, -27 lines)

docs/source/adding-a-custom-task.mdx
Lines changed: 1 addition & 1 deletion

@@ -56,7 +56,7 @@ custom_metric = SampleLevelMetric(
     category=MetricCategory.IGNORED,
     use_case=MetricUseCase.NONE,
     sample_level_fn=lambda x: x, # how to compute score for one sample
-    corpus_level_fn=np.mean, # How to aggreagte the samples metrics
+    corpus_level_fn=np.mean, # How to aggregate the samples metrics
 )
 ```

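As context for the line being fixed above, here is a minimal standalone sketch (not part of the commit) of what `corpus_level_fn=np.mean` does: it aggregates the per-sample values that `sample_level_fn` produces into a single corpus-level score. The sample scores below are hypothetical.

```python
import numpy as np

# Hypothetical per-sample scores, standing in for the values sample_level_fn returns.
sample_scores = [1.0, 0.0, 1.0]

# corpus_level_fn (here np.mean) aggregates the per-sample metrics into one corpus-level score.
corpus_score = np.mean(sample_scores)
print(corpus_score)  # ~0.667
```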
docs/source/use-sglang-as-backend.mdx
Lines changed: 1 addition & 1 deletion

@@ -11,7 +11,7 @@ lighteval sglang \
 
 `sglang` is able to distribute the model across multiple GPUs using data
 parallelism and tensor parallelism.
-You can choose the parallelism method by setting in the the `model_args`.
+You can choose the parallelism method by setting in the `model_args`.
 
 For example if you have 4 GPUs you can split it across using `tp_size`:

docs/source/use-vllm-as-backend.mdx
Lines changed: 1 addition & 1 deletion

@@ -15,7 +15,7 @@ lighteval vllm \
 
 `vllm` is able to distribute the model across multiple GPUs using data
 parallelism, pipeline parallelism or tensor parallelism.
-You can choose the parallelism method by setting in the the `model_args`.
+You can choose the parallelism method by setting in the `model_args`.
 
 For example if you have 4 GPUs you can split it across using `tensor_parallelism`:

src/lighteval/logging/evaluation_tracker.py
Lines changed: 1 addition & 1 deletion

@@ -584,7 +584,7 @@ def recreate_metadata_card(self, repo_id: str) -> None:  # noqa: C901
             dataset_summary=f"Dataset automatically created during the evaluation run of model "
             f"[{self.general_config_logger.model_name}](https://huggingface.co/{self.general_config_logger.model_name})"
             f"{org_string}.\n\n"
-            f"The dataset is composed of {len(card_metadata) - 1} configuration, each one coresponding to one of the evaluated task.\n\n"
+            f"The dataset is composed of {len(card_metadata) - 1} configuration, each one corresponding to one of the evaluated task.\n\n"
             f"The dataset has been created from {len(results_files)} run(s). Each run can be found as a specific split in each "
             f'configuration, the split being named using the timestamp of the run.The "train" split is always pointing to the latest results.\n\n'
             f'An additional configuration "results" store all the aggregated results of the run.\n\n'

src/lighteval/logging/info_loggers.py
Lines changed: 1 addition & 1 deletion

@@ -137,7 +137,7 @@ def log_model_info(self, generation_parameters: dict, model_info: ModelInfo) ->
         Logs the model information.
 
         Args:
-            model_config: the model config used to initalize the model.
+            model_config: the model config used to initialize the model.
             model_info (ModelInfo): Model information to be logged.
 
         """

src/lighteval/main_custom.py
Lines changed: 3 additions & 3 deletions

@@ -35,10 +35,10 @@
 TOKEN = os.getenv("HF_TOKEN")
 CACHE_DIR: str = os.getenv("HF_HOME", "/scratch")
 
-HELP_PANNEL_NAME_1 = "Common Paramaters"
+HELP_PANNEL_NAME_1 = "Common Parameters"
 HELP_PANNEL_NAME_2 = "Logging Parameters"
-HELP_PANNEL_NAME_3 = "Debug Paramaters"
-HELP_PANNEL_NAME_4 = "Modeling Paramaters"
+HELP_PANNEL_NAME_3 = "Debug Parameters"
+HELP_PANNEL_NAME_4 = "Modeling Parameters"
 
 
 @app.command(rich_help_panel="Evaluation Backends")

src/lighteval/metrics/dynamic_metrics.py
Lines changed: 2 additions & 2 deletions

@@ -58,7 +58,7 @@
 
 def loglikelihood_acc_metric(normalization: LogProbNormalization | None = None) -> SampleLevelMetric:
     """
-    Creates a accuracy (loglikelihood) metric, which returns accuracy given normalization.
+    Creates an accuracy (loglikelihood) metric, which returns accuracy given normalization.
     """
 
     normalization_str = normalization.name if normalization else ""

@@ -199,7 +199,7 @@ def multilingual_extractive_match_metric(
 
     Known issues:
     - If the task is to simplify an expression, the metric might overestimate the accuracy. This is because if the model doesn't output any anchor for the extraction (e.g final answer is..),
-    it's possible that the the extracted prediction will be the expression to simplify. Because we do simplifications ourselves, it can thus happen that sympy will correctly simplify the expression,
+    it's possible that the extracted prediction will be the expression to simplify. Because we do simplifications ourselves, it can thus happen that sympy will correctly simplify the expression,
     thus it will match gold, despite model not doing anything. PRs to fix this are welcome.
 
     - There is currently no StringExtractionConfig, so if the gold is \boxed{\text{Friday}} and model outputs Friday it will not match, because nothing will be extracted.

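The known issue above, and the "Uses sympy for comparison" comments fixed in the next file, both refer to sympy-based equivalence checking. Here is a minimal illustrative sketch (not part of the commit, with hypothetical gold/prediction strings): sympy treats algebraically equivalent expressions as equal, which is why an unsimplified restatement of the expression can still match a simplified gold.

```python
import sympy

# Hypothetical gold answer and extracted prediction.
gold = sympy.sympify("x**2 + 2*x + 1")  # already-simplified gold
pred = sympy.sympify("(x + 1)**2")      # model output that merely restates the expression

# Sympy-based comparison: equivalent expressions compare equal even though the strings differ.
print(sympy.simplify(gold - pred) == 0)  # True
```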
src/lighteval/metrics/metrics.py
Lines changed: 6 additions & 6 deletions

@@ -394,7 +394,7 @@ class Metrics(Enum):
                 language=Language.ENGLISH,
             ),
         ),
-        # Uses sympy for comparision
+        # Uses sympy for comparison
         sample_scoring_function=compare_gold_target,
     ).compute,
     category=MetricCategory.GENERATIVE_SAMPLING,

@@ -426,7 +426,7 @@ class Metrics(Enum):
                 language=Language.ENGLISH,
             ),
         ),
-        # Uses sympy for comparision
+        # Uses sympy for comparison
         sample_scoring_function=compare_gold_target,
     ).compute,
     category=MetricCategory.GENERATIVE_SAMPLING,

@@ -458,7 +458,7 @@ class Metrics(Enum):
                 language=Language.ENGLISH,
             ),
         ),
-        # Uses sympy for comparision
+        # Uses sympy for comparison
        sample_scoring_function=compare_gold_target,
     ).compute,
     category=MetricCategory.GENERATIVE_SAMPLING,

@@ -490,7 +490,7 @@ class Metrics(Enum):
                 language=Language.ENGLISH,
             ),
         ),
-        # Uses sympy for comparision
+        # Uses sympy for comparison
         sample_scoring_function=compare_gold_target,
     ).compute,
     category=MetricCategory.GENERATIVE_SAMPLING,

@@ -522,7 +522,7 @@ class Metrics(Enum):
                 language=Language.ENGLISH,
             ),
         ),
-        # Uses sympy for comparision
+        # Uses sympy for comparison
         sample_scoring_function=compare_gold_target,
     ).compute,
     category=MetricCategory.GENERATIVE_SAMPLING,

@@ -554,7 +554,7 @@ class Metrics(Enum):
                 language=Language.ENGLISH,
             ),
         ),
-        # Uses sympy for comparision
+        # Uses sympy for comparison
         sample_scoring_function=compare_gold_target,
     ).compute,
     category=MetricCategory.GENERATIVE_SAMPLING,

src/lighteval/models/endpoints/tgi_model.py
Lines changed: 1 addition & 1 deletion

@@ -70,7 +70,7 @@ def __init__(self, config: TGIModelConfig) -> None:
         self._max_gen_toks = 256
         self.model_info = requests.get(f"{config.inference_server_address}/info", headers=headers).json()
         if "model_id" not in self.model_info:
-            raise ValueError("Error occured when fetching info: " + str(self.model_info))
+            raise ValueError("Error occurred when fetching info: " + str(self.model_info))
         if config.model_id:
             self.model_info["model_id"] = config.model_id
         self._tokenizer = AutoTokenizer.from_pretrained(self.model_info["model_id"])

src/lighteval/models/nanotron/nanotron_model.py
Lines changed: 5 additions & 5 deletions

@@ -533,7 +533,7 @@ def prepare_batch(
         # tensors, then we pack them together into a batch, call the model, and then pick it all apart
         # again because vectorizing is annoying
 
-        # Each sample is concatenated and cut to lenght or padded to max_length
+        # Each sample is concatenated and cut to length or padded to max_length
         for tokens in batch:
             truncated.append(max(len(tokens) - max_context, 0))
 

@@ -717,7 +717,7 @@ def _loglikelihood_single_token(
             if dist.get_rank(self.parallel_context.pp_pg) == self.output_pp_rank:
                 # This process got outputs
 
-                # Gather all the output accross TP
+                # Gather all the output across TP
                 out = out.transpose(0, 1).contiguous() # [batch, seq_length, vocab]
 
                 gathered_out = [torch.zeros_like(out) for _ in range(self.parallel_context.tp_pg.size())]

@@ -768,7 +768,7 @@ def _loglikelihood_single_token(
                 batch_cont_tokens.append(cont_toks)
 
             # Sync all
-            # Need reshape/padding both locally (on each node) and generally accross nodes
+            # Need reshape/padding both locally (on each node) and generally across nodes
             batched_inputs, _ = self.pad_and_gather(batch_model.input_ids)
             lengths = torch.tensor(batch_model.input_lengths, device=self.device)
             batched_lengths = self.gather(lengths)

@@ -949,7 +949,7 @@ def _loglikelihood_tokens(
             if dist.get_rank(self.parallel_context.pp_pg) == self.output_pp_rank:
                 # This process got outputs
 
-                # Gather all the output accross TP
+                # Gather all the output across TP
                 gathered_out = [torch.zeros_like(out) for _ in range(self.parallel_context.tp_pg.size())]
                 dist.all_gather(gathered_out, out, group=self.parallel_context.tp_pg, async_op=False)
                 out = torch.cat(gathered_out, dim=-1)

@@ -1234,7 +1234,7 @@ def greedy_until(
             padded=[sum(mask == 0) for mask in tokenized["attention_mask"]],
         )
 
-        # responses, logits and input_ids have all been gathered accross GPUs already
+        # responses, logits and input_ids have all been gathered across GPUs already
         # but we also grab the original length of these vectors, which have been padded
         # while being gathered - the added info
         outputs = decode_tokenized(

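The "Gather all the output across TP" comments fixed above describe collecting each tensor-parallel rank's logits shard and concatenating the shards back along the vocabulary dimension. Below is a self-contained sketch of that pattern, not the nanotron code itself: it runs a single-process "gloo" group so it works standalone, whereas the real code gathers over `self.parallel_context.tp_pg`.

```python
import torch
import torch.distributed as dist

# Single-process "gloo" group so the example runs standalone (illustrative assumption;
# the real code uses the tensor-parallel group self.parallel_context.tp_pg).
dist.init_process_group("gloo", init_method="tcp://127.0.0.1:29500", rank=0, world_size=1)

out = torch.randn(2, 5, 8)  # [batch, seq_length, vocab shard held by this rank]

# Gather all the output across TP: every rank contributes its shard,
# then the shards are concatenated back into the full vocabulary dimension.
gathered_out = [torch.zeros_like(out) for _ in range(dist.get_world_size())]
dist.all_gather(gathered_out, out, async_op=False)
full_out = torch.cat(gathered_out, dim=-1)

print(full_out.shape)  # with 1 rank this equals out.shape; with N ranks the last dim is N times larger
dist.destroy_process_group()
```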