evalplus
diff --git a/‎.pre-commit-config.yaml‎
Lines changed: 3 additions & 7 deletions b/‎.pre-commit-config.yaml‎
Lines changed: 3 additions & 7 deletions
diff --git a/‎evalperf.html‎
Lines changed: 284 additions & 0 deletions b/‎evalperf.html‎
Lines changed: 284 additions & 0 deletions
@@ -1,21 +1,17 @@
 repos:
   - repo: https://github.com/pycqa/isort
-    rev: 5.12.0
+    rev: 5.13.2
     hooks:
       - id: isort
         name: isort (python)
         args: ["--profile", "black"]
   - repo: https://github.com/psf/black
-    rev: 22.6.0
+    rev: 24.10.0
     hooks:
       - id: black
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.5.0
+    rev: v5.0.0
     hooks:
       - id: check-yaml
       - id: end-of-file-fixer
       - id: trailing-whitespace
-  - repo: https://github.com/pre-commit/mirrors-prettier
-    rev: "v3.1.0"
-    hooks:
-      - id: prettier
@@ -0,0 +1,284 @@
+<!doctype html>
+<html lang="en">
+
+<head>
+  <meta charset="UTF-8" />
+  <title>
+    EvalPerf: Evaluating Language Models for Efficient Code Generation
+  </title>
+
+  <!-- JetBrains Mono web font, referenced by the body rule in the style block below.
+       Metadata elements such as link must live inside head. -->
+  <link rel="preconnect" href="https://fonts.googleapis.com" />
+  <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
+  <link href="https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@100;400&display=swap" rel="stylesheet" />
+
+  <!-- NOTE(review): the package@version segment of every jsDelivr URL below had been
+       replaced by an e-mail obfuscation placeholder, which made the URLs unloadable.
+       Package names are recovered from the file paths. The versions are semver ranges
+       chosen as assumptions (Bootstrap 5.3 because the page uses the 5.3 link
+       utilities). Confirm them and pin exact versions. -->
+  <script src="https://cdnjs.cloudflare.com/ajax/libs/PapaParse/5.3.0/papaparse.min.js"></script>
+  <script src="https://cdn.jsdelivr.net/npm/echarts@5/dist/echarts.min.js"></script>
+  <link rel="icon" href="https://images.emojiterra.com/google/noto-emoji/unicode-15/color/1024px/1f9d1-1f4bb.png" />
+  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@5.3/dist/css/bootstrap.min.css" />
+
+  <!-- Prism highlights the shell snippet; the autoloader fetches the bash grammar on demand. -->
+  <link href="https://cdn.jsdelivr.net/npm/prismjs@1/themes/prism.css" rel="stylesheet" />
+  <script src="https://cdn.jsdelivr.net/npm/prismjs@1/components/prism-core.min.js"></script>
+  <script src="https://cdn.jsdelivr.net/npm/prismjs@1/plugins/autoloader/prism-autoloader.min.js"></script>
+
+  <style>
+    /* Page-wide typography and colors. */
+    body {
+      font-family: "JetBrains Mono", monospace;
+      background-color: #ffffff;
+      color: #000000;
+    }
+
+    /* Leaderboard cells: left-aligned, sized to their content. */
+    th,
+    td {
+      text-align: left;
+      width: fit-content;
+      font-size: larger;
+    }
+
+    #notes h3 {
+      margin-top: 1em;
+      font-size: 2em;
+      text-align: center;
+    }
+  </style>
+</head>
+
+<body>
+  <div id="content" class="container d-flex flex-column align-items-center gap-3">
+    <h1 class="text-nowrap mt-5" style="font-size: xx-large;">
+      <b>Evaluating LLMs for Efficient Code Generation</b>
+    </h1>
+    <!-- Badge links: paper, source repository, PyPI package. Each image carries alt text
+         naming the link destination so the links are usable without images. -->
+    <div class="d-flex flex-row justify-content-center gap-3">
+      <a href="https://openreview.net/forum?id=IBCBMeAhmC"><img
+          src="https://img.shields.io/badge/Paper-COLM'24-a55fed.svg?style=for-the-badge"
+          alt="Paper (COLM'24)" class="img-fluid" /></a>
+      <a href="https://github.com/evalplus/evalplus"><img
+          src="https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white"
+          alt="github" class="img-fluid" /></a>
+      <a href="https://pypi.org/project/evalplus"><img alt="PyPI - Version"
+          src="https://img.shields.io/pypi/v/evalplus?style=for-the-badge&labelColor=black" class="img-fluid" />
+      </a>
+    </div>
+    <div class="container d-flex flex-row flex-nowrap fs-5">
+
+      <div class="container d-flex flex-column align-items-center">
+        <!-- Overview of the benchmark. -->
+        <div>
+          🚀 Code Efficiency Evaluation requires:
+          <ul>
+            <li><strong>Performance-exercising tasks &amp; inputs</strong>
+            </li>
+            <li><strong>Meaningful compound metric:</strong>
+            </li>
+          </ul>
+          <p>Based on <strong>Differential Performance Evaluation</strong>, the EvalPerf dataset (current
+            version 20240328) includes:</p>
+          <ul>
+            <li>118 performance-exercising tasks</li>
+            <li>Each task is equipped with a <i>computationally challenging test input</i> generated by the SaS
+              generator</li>
+            <li>Differential Performance Score (DPS): <i>"DPS=80"</i> means <i>"submissions can outperform 80%
+                of LLM solutions..."</i></li>
+            <li>Pairwise comparison of LLMs' code efficiency over common passing tasks to ablate correctness impact
+            </li>
+          </ul>
+          Check out our <a href="https://jw-liu.xyz/assets/pdf/jiawei-colm-evalperf-poster.pdf">COLM'24 poster</a> for
+          a more detailed overview!
+        </div>
+
+        <!-- Quick-start commands. Whitespace inside pre is rendered verbatim, so the
+             snippet lines deliberately start at the first column. -->
+        <pre style="padding-top: 0; padding-bottom: 0;">
+          <code class="language-bash">
+pip install --upgrade "evalplus[perf,vllm] @ git+https://github.com/evalplus/evalplus"
+# Or `pip install "evalplus[perf,vllm]" --upgrade` for the latest stable release
+
+sudo sh -c 'echo 0 > /proc/sys/kernel/perf_event_paranoid' # Enable perf
+evalplus.evalperf --model "ise-uiuc/Magicoder-S-DS-6.7B" \
+                  --backend vllm</code>
+          </pre>
+        <br />
+        <!-- Left empty here: the inline script looks this table up by id and fills it. -->
+        <table id="leaderboard"
+          class="table table-responsive table-striped table-bordered flex-shrink-1 border border-5">
+        </table>
+        <h2 id="sponsor" class="text-nowrap mt-5">🤗 Acknowledgment</h2>
+        <p>
+          We thank
+          <a href="https://openai.com/form/researcher-access-program/">OpenAI Researcher Access Program</a> for
+          providing part of the compute.
+        </p>
+      </div>
+    </div>
+  </div>
+
+  <script>
+    const contextTable = document.getElementById("leaderboard");
+    const linkMapping = new Map([]);
+    const hfLinkPrefix = "https://huggingface.co/";
+    const dataUrlPrefix = "results/evalperf";
+
+    // Load data
+    var data = null;
+    var dataUrl = dataUrlPrefix + "/COMBINED-RESULTS.json";
+    var xhr = new XMLHttpRequest();
+    xhr.open("GET", dataUrl, false); // false makes the request synchronous
+    xhr.send();
+
+    if (xhr.status === 200) {
+      var results = JSON.parse(xhr.responseText);
+      data = new Map(Object.entries(results));
+      // convert each value to Map
+      data.forEach((value, modelId) => {
+        data.set(modelId, new Map(Object.entries(value)));
+      });
+      data.forEach((value, modelId) => {
+        // add link to model
+        if (modelId.includes("--")) {
+          modelId = modelId.split("--");
+          modelOrg = modelId[0];
+          modelId = modelId[1];
+          url = hfLinkPrefix + modelOrg + "/" + modelId;
+          linkMapping.set(modelId, url);
+        } else if (modelId.startsWith("gpt-4-")) {
+          linkMapping.set(
+            modelId,
+            "https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4",
+          );
+        } else if (modelId.startsWith("gpt-3.5-")) {
+          linkMapping.set(
+            modelId,
+            "https://platform.openai.com/docs/models/gpt-3-5-turbo",
+          );
+        } else if (modelId.startsWith("claude-3-")) {
+          linkMapping.set(
+            modelId,
+            "https://www.anthropic.com/news/claude-3-family",
+          );
+        } else if (modelId.startsWith("gemini-1.5-pro")) {
+          linkMapping.set(
+            modelId,
+            "https://blog.google/technology/ai/google-gemini-next-generation-model-february-2024/#sundar-note",
+          );
+        } else if (modelId.startsWith("gemini-1.5-flash")) {
+          linkMapping.set(
+            modelId,
+            "https://deepmind.google/technologies/gemini/flash/",
+          );
+        } else if (modelId.startsWith("gpt-4o-")) {
+          linkMapping.set(modelId, "https://openai.com/index/hello-gpt-4o/");
+        } else if (modelId.startsWith("deepseek-chat")) {
+          linkMapping.set(modelId, "https://chat.deepseek.com/")
+        }
+      });
+    } else {
+      alert(
+        "Failed to load data from " + dataUrl + ". Please try again later.",
+      );
+    }
+    const globalData = data;
+    const winrate_tag = "🏆 Win Rate (%)";
+
+    // each row represents a model
+    const theaders = [
+      "#", // rank
+      "Model", // model name
+      "DPS",
+      // "DPS Norm",
+      "pass@1",
+      winrate_tag, // computed over the same set of passing solutions
+    ];
+
+    const displayTable = (table) => {
+      var thead = document.createElement("thead");
+      var headerRow = document.createElement("tr");
+      // headers
+      theaders.forEach(function (header) {
+        var th = document.createElement("th");
+        th.classList.add("text-nowrap");
+        th.textContent = header;
+
+        if (header == winrate_tag) {
+          th.style.backgroundColor = "#EEFFEE";
+        }
+
+        headerRow.appendChild(th);
+      });
+      thead.appendChild(headerRow);
+      table.appendChild(thead);
+
+      // convert data to array of Map
+      data = Array.from(globalData);
+      data = data.map(
+        ([modelId, value]) => new Map([["modelId", modelId], ...value]),
+      )
+      data.sort((a, b) => b.get("win_rate") - a.get("win_rate"));
+
+      var tbody = document.createElement("tbody");
+      // add rank
+      var rank = 0;
+      var last_best = null;
+      var n_last_best = 1;
+      data.forEach((row) => {
+        var dataRow = document.createElement("tr");
+        // rank
+        var rankCell = document.createElement("td");
+        dataRow.appendChild(rankCell);
+        var modelCell = document.createElement("td");
+        var modelLink = document.createElement("a");
+        var modelId = row.get('modelId');
+        var modelName = modelId;
+        if (modelId.includes("--")) {
+          modelName = modelId.split("--")[1];
+        }
+        var cur_win_rate = row.get('win_rate').toFixed(3);
+        if (last_best != cur_win_rate) {
+          rank += n_last_best;
+          last_best = cur_win_rate;
+          rankCell.textContent = rank;
+          n_last_best = 1;
+        } else {
+          n_last_best += 1;
+        }
+        if (rank == 1) {
+          modelLink.textContent = "🥇 " + modelName;
+        } else if (rank == 2) {
+          modelLink.textContent = "🥈 " + modelName;
+        } else if (rank == 3) {
+          modelLink.textContent = "🥉 " + modelName;
+        } else {
+          modelLink.textContent = modelName;
+        }
+        if (linkMapping.has(modelName)) {
+          modelLink.href = linkMapping.get(modelName);
+        }
+        modelLink.classList.add("link-underline-primary");
+        modelLink.classList.add("text-nowrap");
+        modelCell.appendChild(modelLink);
+        dataRow.appendChild(modelCell);
+        dpsRow = document.createElement("td");
+        dpsRow.textContent = row.get("dps").toFixed(1);
+        dataRow.appendChild(dpsRow);
+        // dpsNormRow = document.createElement("td");
+        // dpsNormRow.textContent = row.get("dps_norm").toFixed(1);
+        // dataRow.appendChild(dpsNormRow);
+        passRow = document.createElement("td");
+        passRow.textContent = row.get("pass@1").toFixed(1);
+        dataRow.appendChild(passRow);
+        winRateRow = document.createElement("td");
+        winRateRow.textContent = (row.get('win_rate') * 100).toFixed(1);
+        winRateRow.style.backgroundColor = "#EEFFEE";
+        dataRow.appendChild(winRateRow);
+        tbody.appendChild(dataRow);
+      });
+      table.appendChild(tbody);
+    };
+
+    const clearTable = () => {
+      contextTable.innerHTML = "";
+    };
+
+    const main = () => {
+      clearTable();
+      displayTable(contextTable);
+    };
+
+    main();
+  </script>
+</body>
+
+</html>