Merge pull request #36 from logic-star-ai/feat/updated_eval

nielstron · web-flow · commit 443e03b38508 · 2025-09-13T21:02:36.000+02:00
Update run scores for latest release
diff --git a/docs/approaches.csv b/docs/approaches.csv
@@ -12,4 +12,5 @@ Aider,https://aider.chat,aider,https://github.com/logic-star-ai/swt-bench?tab=re
 AutoCodeRover,https://autocoderover.dev,autocoderover,https://github.com/logic-star-ai/swt-bench?tab=readme-ov-file#evaluation-results
 LIBRO,https://arxiv.org/abs/2209.11515,kaist,https://github.com/logic-star-ai/swt-bench?tab=readme-ov-file#evaluation-results
 Otter++,https://arxiv.org/abs/2502.05368v1,ibm,https://files.sri.inf.ethz.ch/swt-bench/otter/
-Otter,https://arxiv.org/abs/2502.05368v1,ibm,https://files.sri.inf.ethz.ch/swt-bench/otter/
+Otter,https://arxiv.org/abs/2502.05368v1,ibm,https://files.sri.inf.ethz.ch/swt-bench/otter/
+LogicStar AI,https://logicstar.ai/,logicstar,https://logicstar.ai/blog/logicstar-on-test-generation-benchmark-swt
diff --git a/docs/index.template.html b/docs/index.template.html
@@ -289,6 +289,7 @@
               <p>News</p>
             </div>
             <div class="message-body">
+            <p><strong><time>2025-09-13</time></strong> <a href="https://logicstar.ai">LogicStar</a> claims first place on SWT-Verified, achieving almost 80% accuracy. Meanwhile, we release a new version of SWT-Bench that resolves various issues in evaluation grading. As a result, previously reported scores generally increase by 2-3%. Special thanks to all contributors!</p>
             <p><strong><time>2025-08-22</time></strong> The 1st and 3rd place on SWT-Verified are reclaimed by the latest release of <a href="https://all-hands.dev">OpenHands</a>, equipped with the newly released <a href="https://openai.com/index/introducing-gpt-5/">GPT-5</a> and <a href="https://openai.com/index/introducing-gpt-5/">GPT-5-mini</a>, respectively.</p>
             <p><strong><time>2025-08-11</time></strong> <a href="https://arxiv.org/abs/2508.06365">e-Otter++</a> claims the first position on the leaderboard with 50.7% and 60.7% on Lite and Verified respectively. They improve upon prior <a href="https://arxiv.org/abs/2502.05368v2">Otter</a> by more deeply integrating execution feedback and heterogeneous prompts in the generation loop.</p>
             <p><strong><time>2025-07-28</time></strong> <a href="https://github.com/uw-swag/AssertFlip">AssertFlip</a> demonstrates a method to generate test cases by flipping the semantics of generated passing tests, achieving superior performance with a success rate of 35.1% on SWT-Bench Lite and 43.4% on Verified.</p>
diff --git a/docs/orgs.csv b/docs/orgs.csv
@@ -4,7 +4,7 @@ ibm,IBM,https://www.ibm.com/,./static/images/logos/IBM.svg
 aws,Amazon Web Services,https://aws.amazon.com/q/,./static/images/logos/Amazon_Web_Services_Logo.svg
 uw,University of Waterloo (SWAG Lab),https://github.com/uw-swag/AssertFlip,./static/images/logos/uw.svg
 allhands,All Hands AI,https://all-hands.dev/,./static/images/logos/allhands.svg
-logicstar,LogicStar,https://logicstar.ai/,./static/images/logos/logicstar.png
+logicstar,LogicStar,https://logicstar.ai/,./static/images/logos/logicstar_symbol_navy.svg
 swe-agent,SWE-agent,https://swe-agent.com/,./static/images/logos/swe-agent.svg
 aider,Aider,https://aider.chat/,./static/images/logos/aider.png
 autocoderover,AutoCodeRover,https://autocoderover.net,./static/images/logos/autocoderover.svg
diff --git a/docs/runs.csv b/docs/runs.csv
@@ -1,8 +1,8 @@
 table_type,emojis,model_name,model_details,success_rate,coverage_increase,date,data_mode
 lite,,AEGIS,,47.8,26.0,2025-02-17,reproduction
-lite,new,e-Otter++,Claude 3.7 Sonnet,50.7,56.4,2025-08-11,unittest
-lite,,Amazon Q Developer Agent,v20250405-dev,37.7,52.7,2025-04-10,unittest
-lite,new,AssertFlip,GPT-4o,35.1,44.2,2025-07-28,unittest
+lite,,e-Otter++,Claude 3.7 Sonnet,52.5,56.4,2025-08-11,unittest
+lite,,Amazon Q Developer Agent,v20250405-dev,39.9,52.7,2025-04-10,unittest
+lite,,AssertFlip,GPT-4o,38.0,44.2,2025-07-28,unittest
 lite,,OpenHands,"Cl. Sonnet 3.5, CI setup",28.3,52.4,2025-02-18,unittest
 lite,,OpenHands,"Cl. Sonnet 3.5, vanilla",22.8,43.6,2025-02-18,unittest
 lite,,SWE-Agent+,GPT-4,18.5,27.6,2024-05-22,unittest
@@ -17,13 +17,14 @@ lite,,AutoCodeRover,GPT-4,9.1,17.9,2024-05-22,unittest
 lite,,LIBRO,GPT-4,14.1,23.8,2024-05-22,unittest
 lite,,Zero-Shot Plus,GPT-4 + BM25,9.4,21.5,2024-05-22,unittest
 lite,,Zero-Shot Base,GPT-4 + BM25,3.6,7.6,2024-05-22,unittest
-verified,new,OpenHands,GPT-5,75.8,66.3,2025-08-22,unittest
-verified,new,e-Otter++,Claude 3.7 Sonnet,60.7,62.3,2025-08-11,unittest
-verified,new,OpenHands,GPT-5-mini,56.8,60.4,2025-08-22,unittest
-verified,,Amazon Q Developer Agent,v20250405-dev,49.0,57.4,2025-04-10,unittest
-verified,new,AssertFlip,GPT-4o,43.4,47.4,2025-07-28,unittest
-verified,,Otter++,GPT-4o,37.0,42.8,2025-03-10,unittest
-verified,,Otter,GPT-4o,31.4,37.6,2025-03-10,unittest
+verified,new,OpenHands,GPT-5,79.8,66.3,2025-08-22,unittest
+verified,,e-Otter++,Claude 3.7 Sonnet,62.1,62.3,2025-08-11,unittest
+verified,new,OpenHands,GPT-5-mini,62.4,60.6,2025-08-22,unittest
+verified,,Amazon Q Developer Agent,v20250405-dev,51.0,57.4,2025-04-10,unittest
+verified,,AssertFlip,GPT-4o,45.5,47.4,2025-07-28,unittest
+verified,,Otter++,GPT-4o,37.4,42.8,2025-03-10,unittest
+verified,,Otter,GPT-4o,31.6,37.6,2025-03-10,unittest
 verified,,OpenHands,Cl. Sonnet 3.5,27.7,52.9,2025-02-28,unittest
 verified,,LIBRO,GPT-4o,17.8,38.0,2025-02-28,unittest
-verified,,Zero-Shot Plus,GPT-4o + BM25,14.3,34.0,2025-02-28,unittest
+verified,,Zero-Shot Plus,GPT-4o + BM25,14.3,34.0,2025-02-28,unittest
+verified,new,LogicStar AI,L*Agent v1,79.9,66.5,2025-09-13,unittest
diff --git a/docs/static/images/logos/logicstar_symbol_navy.svg b/docs/static/images/logos/logicstar_symbol_navy.svg
@@ -0,0 +1,6 @@
+<svg width="133" height="148" viewBox="0 0 133 148" fill="none" xmlns="http://www.w3.org/2000/svg">
+<path d="M83.6562 59.1447L54.1165 40.651L64.242 0.0651855H69.0955L83.6562 59.1447Z" fill="#04257A"/>
+<path d="M83.6562 88.8552L54.1165 107.349L64.1583 147.935H69.0119L83.5725 88.8552H83.6562Z" fill="#04257A"/>
+<path d="M0.899658 76.3815L49.6026 88.8501L73.3683 73.9547L49.6026 59.0593L0.899658 71.5279V76.3815Z" fill="#04257A"/>
+<path d="M132.368 71.5279L83.6653 59.0593V88.8501L132.368 76.3815V71.5279Z" fill="#04257A"/>
+</svg>