|
566 | 566 |
|
567 | 567 | <h1 id="leaderboard">Leaderboard</h1> |
568 | 568 | <div align="center"> |
569 | | -<h1 id="scicode-leaderboard">SciCode Leaderboard</h1> |
570 | 569 | <table> |
571 | 570 | <thead> |
572 | 571 | <tr> |
573 | 572 | <th>Models</th> |
574 | | -<th>Main Problem Resolve Rate</th> |
575 | | -<th><span style="color:grey">Subproblem</span></th> |
| 573 | +<th style="text-align: center;">Main Problem Resolve Rate</th> |
| 574 | +<th style="text-align: center;"><span style="color:grey">Subproblem</span></th> |
576 | 575 | </tr> |
577 | 576 | </thead> |
578 | 577 | <tbody> |
579 | 578 | <tr> |
580 | 579 | <td>🥇 OpenAI o3-mini-low</td> |
581 | | -<td><div align="center"><strong>10.8</strong></div></td> |
582 | | -<td></td> |
| 580 | +<td style="text-align: center;"><strong>10.8</strong></td> |
| 581 | +<td style="text-align: center;"><span style="color:grey">33.3</span></td> |
| 582 | +</tr> |
| 583 | +<tr> |
| 584 | +<td>🥈 OpenAI o3-mini-high</td> |
| 585 | +<td style="text-align: center;"><strong>9.2</strong></td> |
| 586 | +<td style="text-align: center;"><span style="color:grey">34.4</span></td> |
| 587 | +</tr> |
| 588 | +<tr> |
| 589 | +<td>🥉 OpenAI o3-mini-medium</td> |
| 590 | +<td style="text-align: center;"><strong>9.2</strong></td> |
| 591 | +<td style="text-align: center;"><span style="color:grey">33.0</span></td> |
| 592 | +</tr> |
| 593 | +<tr> |
| 594 | +<td>OpenAI o1-preview</td> |
| 595 | +<td style="text-align: center;"><strong>7.7</strong></td> |
| 596 | +<td style="text-align: center;"><span style="color:grey">28.5</span></td> |
| 597 | +</tr> |
| 598 | +<tr> |
| 599 | +<td>Deepseek-R1</td> |
| 600 | +<td style="text-align: center;"><strong>4.6</strong></td> |
| 601 | +<td style="text-align: center;"><span style="color:grey">28.5</span></td> |
| 602 | +</tr> |
| 603 | +<tr> |
| 604 | +<td>Claude3.5-Sonnet</td> |
| 605 | +<td style="text-align: center;"><strong>4.6</strong></td> |
| 606 | +<td style="text-align: center;"><span style="color:grey">26.0</span></td> |
| 607 | +</tr> |
| 608 | +<tr> |
| 609 | +<td>Claude3.5-Sonnet (new)</td> |
| 610 | +<td style="text-align: center;"><strong>4.6</strong></td> |
| 611 | +<td style="text-align: center;"><span style="color:grey">25.3</span></td> |
| 612 | +</tr> |
| 613 | +<tr> |
| 614 | +<td>Deepseek-v3</td> |
| 615 | +<td style="text-align: center;"><strong>3.1</strong></td> |
| 616 | +<td style="text-align: center;"><span style="color:grey">23.7</span></td> |
| 617 | +</tr> |
| 618 | +<tr> |
| 619 | +<td>Deepseek-Coder-v2</td> |
| 620 | +<td style="text-align: center;"><strong>3.1</strong></td> |
| 621 | +<td style="text-align: center;"><span style="color:grey">21.2</span></td> |
| 622 | +</tr> |
| 623 | +<tr> |
| 624 | +<td>GPT-4o</td> |
| 625 | +<td style="text-align: center;"><strong>1.5</strong></td> |
| 626 | +<td style="text-align: center;"><span style="color:grey">25.0</span></td> |
| 627 | +</tr> |
| 628 | +<tr> |
| 629 | +<td>GPT-4-Turbo</td> |
| 630 | +<td style="text-align: center;"><strong>1.5</strong></td> |
| 631 | +<td style="text-align: center;"><span style="color:grey">22.9</span></td> |
| 632 | +</tr> |
| 633 | +<tr> |
| 634 | +<td>OpenAI o1-mini</td> |
| 635 | +<td style="text-align: center;"><strong>1.5</strong></td> |
| 636 | +<td style="text-align: center;"><span style="color:grey">22.2</span></td> |
| 637 | +</tr> |
| 638 | +<tr> |
| 639 | +<td>Gemini 1.5 Pro</td> |
| 640 | +<td style="text-align: center;"><strong>1.5</strong></td> |
| 641 | +<td style="text-align: center;"><span style="color:grey">21.9</span></td> |
| 642 | +</tr> |
| 643 | +<tr> |
| 644 | +<td>Claude3-Opus</td> |
| 645 | +<td style="text-align: center;"><strong>1.5</strong></td> |
| 646 | +<td style="text-align: center;"><span style="color:grey">21.5</span></td> |
| 647 | +</tr> |
| 648 | +<tr> |
| 649 | +<td>Llama-3.1-405B-Chat</td> |
| 650 | +<td style="text-align: center;"><strong>1.5</strong></td> |
| 651 | +<td style="text-align: center;"><span style="color:grey">19.8</span></td> |
| 652 | +</tr> |
| 653 | +<tr> |
| 654 | +<td>Claude3-Sonnet</td> |
| 655 | +<td style="text-align: center;"><strong>1.5</strong></td> |
| 656 | +<td style="text-align: center;"><span style="color:grey">17.0</span></td> |
| 657 | +</tr> |
| 658 | +<tr> |
| 659 | +<td>Qwen2-72B-Instruct</td> |
| 660 | +<td style="text-align: center;"><strong>1.5</strong></td> |
| 661 | +<td style="text-align: center;"><span style="color:grey">17.0</span></td> |
| 662 | +</tr> |
| 663 | +<tr> |
| 664 | +<td>Llama-3.1-70B-Chat</td> |
| 665 | +<td style="text-align: center;"><strong>0.0</strong></td> |
| 666 | +<td style="text-align: center;"><span style="color:grey">17.0</span></td> |
| 667 | +</tr> |
| 668 | +<tr> |
| 669 | +<td>Mixtral-8x22B-Instruct</td> |
| 670 | +<td style="text-align: center;"><strong>0.0</strong></td> |
| 671 | +<td style="text-align: center;"><span style="color:grey">16.3</span></td> |
| 672 | +</tr> |
| 673 | +<tr> |
| 674 | +<td>Llama-3-70B-Chat</td> |
| 675 | +<td style="text-align: center;"><strong>0.0</strong></td> |
| 676 | +<td style="text-align: center;"><span style="color:grey">14.6</span></td> |
583 | 677 | </tr> |
584 | 678 | </tbody> |
585 | 679 | </table> |
586 | | -</div> |
587 | | -<pre><code> | |
588 | | -</code></pre> |
589 | | -<div align="center" style="color:grey">33.3</div> |
590 | | -<pre><code> | |
591 | | -</code></pre> |
592 | | -<p>| 🥈 OpenAI o3-mini-high | </p><div align="center"><strong>9.2</strong></div> | <div align="center" style="color:grey">34.4</div> | |
593 | | -| 🥉 OpenAI o3-mini-medium | <div align="center"><strong>9.2</strong></div> | <div align="center" style="color:grey">33.0</div> | |
594 | | -| OpenAI o1-preview | <div align="center"><strong>7.7</strong></div> | <div align="center" style="color:grey">28.5</div> | |
595 | | -| Deepseek-R1 | <div align="center"><strong>4.6</strong></div> | <div align="center" style="color:grey">28.5</div> | |
596 | | -| Claude3.5-Sonnet | <div align="center"><strong>4.6</strong></div> | <div align="center" style="color:grey">26.0</div> | |
597 | | -| Claude3.5-Sonnet (new) | <div align="center"><strong>4.6</strong></div> | <div align="center" style="color:grey">25.3</div> | |
598 | | -| Deepseek-v3 | <div align="center"><strong>3.1</strong></div> | <div align="center" style="color:grey">23.7</div> | |
599 | | -| Deepseek-Coder-v2 | <div align="center"><strong>3.1</strong></div> | <div align="center" style="color:grey">21.2</div> | |
600 | | -| GPT-4o | <div align="center"><strong>1.5</strong></div> | <div align="center" style="color:grey">25.0</div> | |
601 | | -| GPT-4-Turbo | <div align="center"><strong>1.5</strong></div> | <div align="center" style="color:grey">22.9</div> | |
602 | | -| OpenAI o1-mini | <div align="center"><strong>1.5</strong></div> | <div align="center" style="color:grey">22.2</div> | |
603 | | -| Gemini 1.5 Pro | <div align="center"><strong>1.5</strong></div> | <div align="center" style="color:grey">21.9</div> | |
604 | | -| Claude3-Opus | <div align="center"><strong>1.5</strong></div> | <div align="center" style="color:grey">21.5</div> | |
605 | | -| Llama-3.1-405B-Chat | <div align="center"><strong>1.5</strong></div> | <div align="center" style="color:grey">19.8</div> | |
606 | | -| Claude3-Sonnet | <div align="center"><strong>1.5</strong></div> | <div align="center" style="color:grey">17.0</div> | |
607 | | -| Qwen2-72B-Instruct | <div align="center"><strong>1.5</strong></div> | <div align="center" style="color:grey">17.0</div> | |
608 | | -| Llama-3.1-70B-Chat | <div align="center"><strong>0.0</strong></div> | <div align="center" style="color:grey">17.0</div> | |
609 | | -| Mixtral-8x22B-Instruct | <div align="center"><strong>0.0</strong></div> | <div align="center" style="color:grey">16.3</div> | |
610 | | -| Llama-3-70B-Chat | <div align="center"><strong>0.0</strong></div> | <div align="center" style="color:grey">14.6</div> |<p></p> |
611 | 680 | <p><strong>Note: If models tie on the Main Problem resolve rate, they are ranked by their Subproblem resolve rate.</strong></p> |
612 | 681 | <!-- Once you've added the results to the submission repository, |
613 | 682 | bring back the table here --> |
614 | 683 | <!-- include-markdown "leaderboard_table.md" --> |
615 | | - |
616 | | - |
| 684 | +</div> |
617 | 685 | <div class="admonition tip"> |
618 | 686 | <p class="admonition-title">How to submit</p> |
619 | 687 | <p>Want to submit your own model? Submit a request via a <a href="https://github.com/scicode-bench/SciCode/issues">GitHub issue</a>.</p> |
|
0 commit comments