repoqa: update models and add ack

ganler · ganler · commit 8920704cd659 · 2024-05-23T04:20:34.000-05:00
diff --git a/repoqa.html b/repoqa.html
@@ -337,6 +337,28 @@ <h3 id="limit" class="text-nowrap mt-5">Known limitations</h3>
               (Thanks @chrisgorgo for the suggestion!)
             </li>
           </ul>
+          <h2 id="sponsor" class="text-nowrap mt-5">🤗 Acknowledgment</h2>
+          <p>
+            Running long-context evaluations can be costly -- we thank
+            <a href="https://deepmind.google/">Google DeepMind</a>
+            and
+            <a href="https://openai.com/form/researcher-access-program/"
+              >OpenAI Researcher Access Program</a
+            >
+            for their generous API credits!
+          </p>
+          <p>
+            Meanwhile, note that RepoQA is a transparent research project
+            started by students at UIUC. We assure the reproducibility and
+            fairness of the evaluation as well as the independence of our
+            benchmark design: none of these will be optimized or compromised
+            for models from specific organizations. The outputs and results of
+            evaluated models can be found at our
+            <a
+              href="https://github.com/evalplus/repoqa/releases/tag/dev-results"
+              >GitHub release page</a
+            >.
+          </p>
         </div>
       </div>
     </div>
diff --git a/results/repoqa/COMBINED-RESULTS.json b/results/repoqa/COMBINED-RESULTS.json
@@ -2015,6 +2015,90 @@
       }
     }
   },
+  "microsoft/Phi-3-small-128k-instruct": {
+    "eval_date": "2024-05-23 02:31:23.338528",
+    "train_size": "128k",
+    "scores": {
+      "all": {
+        "0.0": { "pass@1": 0.792 },
+        "0.1": { "pass@1": 0.662 },
+        "0.2": { "pass@1": 0.612 },
+        "0.3": { "pass@1": 0.564 },
+        "0.4": { "pass@1": 0.526 },
+        "0.5": { "pass@1": 0.49 },
+        "0.6": { "pass@1": 0.466 },
+        "0.7": { "pass@1": 0.444 },
+        "0.8": { "pass@1": 0.396 },
+        "0.9": { "pass@1": 0.322 },
+        "1.0": { "pass@1": 0.242 }
+      },
+      "python": {
+        "0.0": { "pass@1": 0.74 },
+        "0.1": { "pass@1": 0.57 },
+        "0.2": { "pass@1": 0.52 },
+        "0.3": { "pass@1": 0.46 },
+        "0.4": { "pass@1": 0.41 },
+        "0.5": { "pass@1": 0.36 },
+        "0.6": { "pass@1": 0.34 },
+        "0.7": { "pass@1": 0.29 },
+        "0.8": { "pass@1": 0.25 },
+        "0.9": { "pass@1": 0.23 },
+        "1.0": { "pass@1": 0.21 }
+      },
+      "cpp": {
+        "0.0": { "pass@1": 0.83 },
+        "0.1": { "pass@1": 0.68 },
+        "0.2": { "pass@1": 0.62 },
+        "0.3": { "pass@1": 0.57 },
+        "0.4": { "pass@1": 0.55 },
+        "0.5": { "pass@1": 0.53 },
+        "0.6": { "pass@1": 0.52 },
+        "0.7": { "pass@1": 0.5 },
+        "0.8": { "pass@1": 0.48 },
+        "0.9": { "pass@1": 0.4 },
+        "1.0": { "pass@1": 0.35 }
+      },
+      "java": {
+        "0.0": { "pass@1": 0.83 },
+        "0.1": { "pass@1": 0.74 },
+        "0.2": { "pass@1": 0.69 },
+        "0.3": { "pass@1": 0.67 },
+        "0.4": { "pass@1": 0.6 },
+        "0.5": { "pass@1": 0.58 },
+        "0.6": { "pass@1": 0.55 },
+        "0.7": { "pass@1": 0.53 },
+        "0.8": { "pass@1": 0.46 },
+        "0.9": { "pass@1": 0.39 },
+        "1.0": { "pass@1": 0.3 }
+      },
+      "typescript": {
+        "0.0": { "pass@1": 0.9 },
+        "0.1": { "pass@1": 0.81 },
+        "0.2": { "pass@1": 0.78 },
+        "0.3": { "pass@1": 0.7 },
+        "0.4": { "pass@1": 0.68 },
+        "0.5": { "pass@1": 0.62 },
+        "0.6": { "pass@1": 0.58 },
+        "0.7": { "pass@1": 0.56 },
+        "0.8": { "pass@1": 0.49 },
+        "0.9": { "pass@1": 0.34 },
+        "1.0": { "pass@1": 0.14 }
+      },
+      "rust": {
+        "0.0": { "pass@1": 0.66 },
+        "0.1": { "pass@1": 0.51 },
+        "0.2": { "pass@1": 0.45 },
+        "0.3": { "pass@1": 0.42 },
+        "0.4": { "pass@1": 0.39 },
+        "0.5": { "pass@1": 0.36 },
+        "0.6": { "pass@1": 0.34 },
+        "0.7": { "pass@1": 0.34 },
+        "0.8": { "pass@1": 0.3 },
+        "0.9": { "pass@1": 0.25 },
+        "1.0": { "pass@1": 0.21 }
+      }
+    }
+  },
   "gemini-1.5-flash-latest": {
     "eval_date": "2024-05-19 04:32:12.200298",
     "train_size": "1000k",