huggingface · plaguss · Feb 19, 2025 · Feb 19, 2025 · Aug 26, 2025 · Aug 26, 2025
diff --git a/src/lighteval/tasks/extended/__init__.py b/src/lighteval/tasks/extended/__init__.py
@@ -24,14 +24,15 @@
 
 
 if can_load_extended_tasks():
+    import lighteval.tasks.extended.codeelo.main as codeelo
     import lighteval.tasks.extended.ifeval.main as ifeval
     import lighteval.tasks.extended.lcb.main as lcb
     import lighteval.tasks.extended.mix_eval.main as mix_eval
     import lighteval.tasks.extended.mt_bench.main as mt_bench
     import lighteval.tasks.extended.olympiade_bench.main as olympiad_bench
     import lighteval.tasks.extended.tiny_benchmarks.main as tiny_benchmarks
 
-    AVAILABLE_EXTENDED_TASKS_MODULES = [ifeval, tiny_benchmarks, mt_bench, mix_eval, olympiad_bench, lcb]
+    AVAILABLE_EXTENDED_TASKS_MODULES = [ifeval, tiny_benchmarks, mt_bench, mix_eval, olympiad_bench, lcb, codeelo]
 
 else:
     AVAILABLE_EXTENDED_TASKS_MODULES = []
diff --git a/src/lighteval/tasks/extended/codeelo/main.py b/src/lighteval/tasks/extended/codeelo/main.py
@@ -0,0 +1,106 @@
+# MIT License
+
+# Copyright (c) 2025 The HuggingFace Team
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+"""
+Code obtained from: https://github.com/QwenLM/CodeElo
+
+Usage:
+lighteval vllm \
+    "pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,dtype=bfloat16,data_parallel_size=8,max_model_length=32768,gpu_memory_utilisation=0.8,generation_parameters={temperature: 0.7,repetition_penalty:1.1,top_p:0.8,top_k:20}" \
+    "extended|codeelo:rating|0|0" \
+    --use-chat-template
+"""
+
+from typing import Any
+
+import numpy as np
+from aenum import extend_enum
+
+from lighteval.metrics.metrics import MetricCategory, Metrics, MetricUseCase, SampleLevelMetric
+from lighteval.tasks.extended.codeelo.utils import LANG_MAP, extract_code_blocks, make_html_problem, submit_code
+from lighteval.tasks.lighteval_task import Doc, LightevalTaskConfig
+
+
+def codeelo_prompt_fn(line: dict[str, Any], task_name: str = "codeelo:rating") -> Doc:
+    instruction = "You are a coding expert. Given a competition-level coding problem, you need to write a C++ program to solve it. You may start by outlining your thought process. In the end, please provide the complete code in a code block enclosed with ``` ```."
+    return Doc(
+        task_name=task_name,
+        query=f"{instruction}\n\n{make_html_problem(line)}",
+        choices=[""],
+        gold_index=0,
+        specific={
+            "problem_id": line["id"],
+        },
+    )
+
+
+def codeelo(predictions: list[str], formatted_doc: Doc, **kwargs) -> float:
+    """Estimates the Pass@1 metric for the code generation task.
+    Extract the code from each prediction, Runs it for each sample and generations,
+    and computes the Pass@1 over the outputs.
+    """
+    code_blocks = [extract_code_blocks(pred)[0] for pred in predictions]
+    # Extract the code and language
+    # TODO: For the moment we are only considering a single generation per problem, there will be 8 (like Pass@1:8)
+    for code_block in code_blocks:
+        if not code_block:
+            return 0.0
+        code, lang = code_block
+        lang_id = LANG_MAP.get(lang, None)
+        if not lang_id:
+            return 0.0
+        submission_id = submit_code(formatted_doc.specific["problem_id"], lang_id, code, tag="")
+        if isinstance(submission_id, str):
+            return 0.0
+
+    return 0.0
+
+
+codeelo_metric = SampleLevelMetric(
+    metric_name="codeelo_rating@8",  # Generate 8 outputs per problem
+    category=MetricCategory.GENERATIVE_SAMPLING,
+    use_case=MetricUseCase.REASONING,
+    higher_is_better=True,
+    sample_level_fn=codeelo,
+    corpus_level_fn=np.mean,
+)
+
+
+extend_enum(Metrics, "codeelo_metric", codeelo_metric)
+
+
+task = LightevalTaskConfig(
+    name="codeelo:rating",
+    suite=["extended"],
+    prompt_function=codeelo_prompt_fn,
+    hf_repo="Qwen/CodeElo",
+    hf_subset="test",
+    hf_avail_splits=["test"],
+    evaluation_splits=["test"],
+    generation_size=32768,
+    metric=[Metrics.codeelo_metric],
-    metric=[Metrics.codeelo_metric],
+    metrics=[Metrics.codeelo_metric],
-    metric=[Metrics.codeelo_metric],
+    metrics=[Metrics.codeelo_metric],
+    stop_sequence=[],  # no stop sequence, will use EOS token
+    trust_dataset=True,
+    version=0,
+)
+
+
+TASKS_TABLE = [task]
diff --git a/src/lighteval/tasks/extended/codeelo/utils.py b/src/lighteval/tasks/extended/codeelo/utils.py
@@ -0,0 +1,266 @@
+# MIT License
+
+# Copyright (c) 2025 The HuggingFace Team
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import os
+import re
+import time
+from typing import Any
+
+import requests
+from scipy import optimize
+
+
+URL_CODEFORCES_STANDINGS = "https://codeforces.com/api/contest.standings?contestId={contest_id}&showUnofficial=false"
+URL_RATING_CHANGES = "https://codeforces.com/api/contest.ratingChanges?contestId={contest_id}"
+
+
+def make_html_problem(line: dict[str, Any]) -> str:
+    title = line["title"]
+    html = f"<html><body><h1>{title}</h1>"
+    input_, output = line["input"], line["output"]
+    interaction = line["interaction"]
+    note = line["note"]
+    examples = line["examples"]
+    # Use interaction if provided, otherwise use input and output
+    html += f"<h2>Description</h2><div>{line['description']}</div>"
+    if interaction:
+        html += f"<h2>Interaction</h2><div>{interaction}</div>"
+    else:
+        html += f"<h2>Input</h2><div>{input_}</div>"
+        html += f"<h2>Output</h2><div>{output}</div>"
+
+    # The example is always present
+    example_text = ""
+    for example in examples:
+        example_text += f"<div>Input:\n{example[0]}</div><div>Output:\n{example[1]}</div>"
+    html += f"<h2>{'Example' if len(examples) == 1 else 'Examples'}</h2>{example_text}"
+
+    if note:
+        html += f"<h2>Note</h2><div>{note}</div>"
+
+    html += "</body></html>"
+    return html
+
+
+# Let the whole mapping defined, but the output should be cpp
+LANG_MAP = {
+    "kotlin": 88,  # Kotlin 1.9.21
+    "cpp": 91,  # GNU G++23 14.2 (64 bit, msys2)
+    "ruby": 67,  # Ruby 3.2.2
+    "d": 28,  # D DMD32 v2.105.0
+    "python": 70,  # PyPy 3.10 (7.3.15, 64bit)
+    "pascal": 51,  # PascalABC.NET 3.8.3
+    "rust": 75,  # Rust 1.75.0 (2021)
+    "go": 32,  # Go 1.22.2
+    "node.js": 55,  # Node.js 15.8.0 (64bit)
+    "haskell": 12,  # Haskell GHC 8.10.1
+    "javascript": 34,  # JavaScript V8 4.8.0
+    "csharp": 79,  # C# 10, .NET SDK 6.0
+    "perl": 13,  # Perl 5.20.1
+    "java": 87,  # Java 21 64bit
+    "ocaml": 19,  # OCaml 4.02.1
+    "delphi": 3,  # Delphi 7
+    "php": 6,  # PHP 8.1.7
+    "scala": 20,  # Scala 2.12.8
+    "c": 43,  # GNU GCC C11 5.1.0
+}
+
+
+def extract_code_blocks(text: str) -> list[tuple[str, str]]:
+    """Extracts code blocks from a text, returning a list of tuples with the language and code (if found)."""
+    pattern = r"```(\w*)\n(.*?)\n```"
+    matches = re.findall(pattern, text, re.DOTALL)
+    return matches
+
+
+def submit_code(prob: str, lang_id: int, code: str, tag: str = "", retry: int = 3, delay: int = 10) -> int | str:
+    """Submits code for a specific problem to the API endpoint.
+
+    Args:
+        prob (str): The problem identifier to submit code for.
+        lang (int): The programming language id of the submitted code.
+        code (str): The actual code to be submitted.
+        tag (str, optional): Additional tag for the submission. Defaults to empty string.
+        retry (int, optional): Number of retry attempts if the request fails.
+            Defaults to RETRY constant.
+
+    Returns:
+        dict/str: If successful, returns a JSON response containing submission details.
+            If all retries fail, returns an error message string.
+
+    Example:
+        >>> result = submit_code("2000A", 70, "print('Hello')")
+        >>> result = submit_code("2000A", 91, "int main() {}", "test", retry=3)
+    """
+    token = os.getenv("CODEELO_TOKEN")  # Replace with your own token
+    base_url = os.getenv("CODEELO_BASE_URL")
+
+    if not token or not base_url:
+        raise ValueError("Please set the CODEELO_TOKEN and CODEELO_BASE_URL environment variables.")
+
+    try:
+        url = f"{base_url}/submit_code"
+        headers = {"Content-Type": "application/json", "Authorization": token}
+        payload = {"prob": prob, "lang": lang_id, "code": code, "tag": tag}
+        response = requests.post(url, json=payload, headers=headers, timeout=60)
+        assert response.status_code == 200, "Failed to submit code"
+        return response.json()["submission_id"]
+
+    except Exception as e:
+        if retry > 0:
+            print(f"Failed to submit code, retrying in {delay} seconds")
+            time.sleep(delay)
+            return submit_code(prob, lang_id, code, tag, retry - 1)
+        else:
+            return f"Failed to submit code: {str(e)}"
+
+
+def calc_elo_rating(contest_id: int, problem_status: dict[str, Any]) -> int:  # noqa: C901
+    """Compute the ELO rating for the given contest id.
+
+    Args:
+        contest_id (int): _description_
+        problem_status (dict[str, Any]): _description_
+
+    Returns:
+        int: _description_
+    """
+    standings = requests.get(URL_CODEFORCES_STANDINGS.format(contest_id=contest_id)).json()
+    rating_changes = requests.get(URL_RATING_CHANGES.format(contest_id=contest_id)).json()
+
+    try:
+        handle_set = {
+            standings["result"]["rows"][i]["party"]["members"][0]["handle"]
+            for i in range(len(standings["result"]["rows"]))
+        } and {rating_changes["result"][i]["handle"] for i in range(len(rating_changes["result"]))}
+        standings["result"]["rows"] = [
+            standings["result"]["rows"][i]
+            for i in range(len(standings["result"]["rows"]))
+            if standings["result"]["rows"][i]["party"]["members"][0]["handle"] in handle_set
+        ]
+        rating_changes["result"] = [
+            rating_changes["result"][i]
+            for i in range(len(rating_changes["result"]))
+            if rating_changes["result"][i]["handle"] in handle_set
+        ]
+
+        assert (len(standings["result"]["rows"]) == len(rating_changes["result"])) and len(
+            standings["result"]["rows"]
+        ) > 200, "No result"
+
+    except Exception as e:
+        print(e)
+
+    if (
+        ("result" not in standings)
+        or ("result" not in rating_changes)
+        or (len(standings["result"]["rows"]) != len(rating_changes["result"]))
+        or (len(standings["result"]["rows"]) <= 200)
+    ):
+        print("No result, return 0")
+        return 0
+
+    max_rating = 0
+    for i in range(len(rating_changes["result"])):
+        max_rating = max(max_rating, rating_changes["result"][i]["oldRating"])
+
+    # Obtain score and penalty
+    score = 0
+    penalty = 0
+
+    for problem in standings["result"]["problems"]:
+        prob = f"{problem['contestId']}{problem['index']}"
+        if prob in problem_status.keys():
+            for ith, status in enumerate(problem_status[prob]):
+                if status == "AC":
+                    if "points" in problem:
+                        score += max(0, problem["points"] - 50 * ith)
+                    else:
+                        score += 1
+                        penalty += ith * 10
+                    break
+
+    # Obtain number of participants and target rank
+    n = len(standings["result"]["rows"])
+
+    rank = n
+    for i in range(n):
+        if (standings["result"]["rows"][i]["points"] < score) or (
+            (standings["result"]["rows"][i]["points"] == score)
+            and (standings["result"]["rows"][i]["penalty"] > penalty)
+        ):
+            rank = i
+            break
+
+    return find_rating(rating_changes, rank, max_rating=max_rating)
+
+
+def calculate_elo_expectation(candidate_rating: float, player_ratings: list[float]) -> float:
+    """Calculate the expected score based on Elo rating formula"""
+    return 1 + sum(1 / (1 + 10 ** ((candidate_rating - rating) / 400)) for rating in player_ratings)
+
+
+def find_rating(rating_changes: dict, target_rank: float, max_rating: int = 4000) -> int:
+    """Find the rating using scipy's root finding methods"""
+    old_ratings = [change["oldRating"] for change in rating_changes["result"]]
+
+    def rating_difference(x: float) -> float:
+        return calculate_elo_expectation(x, old_ratings) - target_rank
+
+    # Use binary search method from scipy
+    result = optimize.root_scalar(rating_difference, bracket=[0, max_rating + 100], method="brentq")
+
+    return int(result.root)
+
+
+def check_status(submission_id, retry: int = 3, delay: int = 10) -> dict[str, Any] | str:
+    """Checks the status of a specific submission using the API endpoint.
+
+    Args:
+        submission_id (str): The ID of the submission to check.
+        retry (int, optional): Number of retry attempts if the request fails.
+
+    Returns:
+        dict/str: If successful, returns a JSON response containing submission status.
+            If all retries fail, returns an error message.
+
+    Example:
+        >>> status = check_status("12345")
+        >>> status = check_status("67890", retry=3)
+    """
+    token = os.getenv("CODEELO_TOKEN")
+    base_url = os.getenv("CODEELO_BASE_URL")
+
+    try:
+        url = f"{base_url}/check_status"
+        headers = {"Content-Type": "application/json", "Authorization": token}
+        params = {"submission_id": submission_id}
+        response = requests.get(url, headers=headers, params=params, timeout=20)
+        assert response.status_code == 200
+        return response.json()["status_canonical"]
+    except Exception as e:
+        if retry > 0:
+            print(f"Failed to get problem, retrying in {delay} seconds")
+            time.sleep(delay)
+            return check_status(submission_id, retry - 1)
+        else:
+            return f"Failed to get problem: {str(e)}"