paper/sections/2-benchmark.tex (2 additions, 1 deletion)
@@ -24,6 +24,7 @@ \section{Scientific General Intelligence: Concept and Operational Definition}
SGI-Bench departs from conventional benchmarks that emphasize factual recall or single-turn reasoning. Instead, it operationalizes the long-horizon workflow of scientific discovery into four interdependent stages: literature review (Deliberation), methodology design (Conception), experiment implementation (Action), and experimental analysis (Perception). These stages correspond to fundamental capabilities required of AI systems: information integration and understanding (Scientific Deep Research), design and planning (Idea Generation), experimental execution (AI-Assisted Scientific Experiment), and reasoning-based interpretation (Scientific Experimental Reasoning). Together, they form a unified framework that measures not only what models know but how they think, plan, and adapt in pursuit of new knowledge.
+
\begin{figure}[ht]
% \vspace{-0.5em}
\centerline
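The four-stage framework in the paragraph above maps each stage of the discovery workflow onto a capability and a corresponding benchmark task. A minimal sketch of that mapping as a plain data structure; the dictionary itself is illustrative and not part of the benchmark's code, only the names come from the paper's text:

```python
# Illustrative mapping of SGI-Bench's four interdependent stages to the
# capability and benchmark task each one operationalizes (names taken from
# the paragraph above; the structure itself is an editorial sketch).
SGI_BENCH_STAGES = {
    "Deliberation": {
        "workflow_stage": "literature review",
        "capability": "information integration and understanding",
        "task": "Scientific Deep Research",
    },
    "Conception": {
        "workflow_stage": "methodology design",
        "capability": "design and planning",
        "task": "Idea Generation",
    },
    "Action": {
        "workflow_stage": "experiment implementation",
        "capability": "experimental execution",
        "task": "AI-Assisted Scientific Experiment",
    },
    "Perception": {
        "workflow_stage": "experimental analysis",
        "capability": "reasoning-based interpretation",
        "task": "Scientific Experimental Reasoning",
    },
}
```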
@@ -462,7 +463,7 @@ \subsubsection{Metrics of AI-Assisted Scientific Experiment}
\paragraph{Dry Experiment}
\label{sec:MetricofDryExperiment}
-Dry experiments focus on code generation tasks. Specifically, each problem includes background information, data code, and main code with certain functions masked. The model is tasked with completing the missing functions. Each problem contains 5 unit tests. Our metrics capture both correctness and execution behavior of the generated code.~\cite{jain2024livecodebenchholisticcontaminationfree}
+Dry experiments focus on code generation tasks. Specifically, each problem includes background information, data code, and main code with certain functions masked. The model is tasked with completing the missing functions. Each problem contains 5 unit tests. Our metrics capture both correctness and execution behavior of the generated code~\cite{jain2024livecodebenchholisticcontaminationfree}.
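The dry-experiment metric described above (complete a masked function, then judge the completion by its 5 unit tests, tracking both correctness and execution behavior) can be sketched roughly as follows. This is not the benchmark's released harness; the `# <MASKED FUNCTION>` placeholder, the file layout, and the exact execution-behavior criterion are assumptions made for illustration:

```python
# Rough sketch of scoring one dry-experiment problem: insert the model's
# completion into the masked main code, then run each unit test in a
# subprocess. Marker name, file layout, and the execution-behavior criterion
# are illustrative assumptions, not the benchmark's actual harness.
import os
import subprocess
import tempfile


def score_completion(main_code: str, completion: str, unit_tests: list[str]) -> dict:
    program = main_code.replace("# <MASKED FUNCTION>", completion)
    passed = executed = 0
    with tempfile.TemporaryDirectory() as tmp:
        with open(os.path.join(tmp, "solution.py"), "w") as f:
            f.write(program)
        for test in unit_tests:
            with open(os.path.join(tmp, "run_test.py"), "w") as f:
                f.write("from solution import *\n" + test)
            try:
                proc = subprocess.run(
                    ["python", "run_test.py"],
                    cwd=tmp, capture_output=True, timeout=120,
                )
            except subprocess.TimeoutExpired:
                continue  # counts as neither executed nor passed
            if proc.returncode == 0:
                passed += 1    # correctness: the test's assertions held
            if proc.returncode == 0 or b"AssertionError" in proc.stderr:
                executed += 1  # execution behavior: the code ran without crashing
    return {
        "pass_rate": passed / len(unit_tests),
        "execution_rate": executed / len(unit_tests),
    }
```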
-\caption{\textbf{Deep Research Task Metrics (LLMs)}: Category-wise scores across Properties, Micro/Macro-Experiments, and Data.}
+\caption{\textbf{Deep Research Task Metrics (LLMs)}: Category-wise scores across Properties, Micro/Macro-Experiments, and Data. Note: Because subjects differ in their characteristics, the number of questions in each category is not the same (Figure~\ref{fig:data_distribution}); overall model performance therefore cannot be obtained by directly averaging the values in this table.}
-\caption{\textbf{Deep Research Task Metrics (Agents)}: Category-wise scores across Properties, Micro/Macro-Experiments, and Data.}
+\caption{\textbf{Deep Research Task Metrics (Agents)}: Category-wise scores across Properties, Micro/Macro-Experiments, and Data. Note: Because subjects differ in their characteristics, the number of questions in each category is not the same (Figure~\ref{fig:data_distribution}); overall model performance therefore cannot be obtained by directly averaging the values in this table.}
-\caption{\textbf{Dry Experiment Function Categories}: Completion scores across six function types.}
+\caption{\textbf{Dry Experiment Function Categories}: Completion scores across six function types. Note: Because subjects differ in their characteristics, the number of questions in each category is not the same (Figure~\ref{fig:data_distribution}); overall model performance therefore cannot be obtained by directly averaging the values in this table.}
-\caption{\textbf{Experimental Reasoning by Type (Multi-choice Accuracy)}: Scores across signal, attribute, comparative, and causal reasoning.}
+\caption{\textbf{Experimental Reasoning by Type (Multi-choice Accuracy)}: Scores across signal, attribute, comparative, and causal reasoning. Note: Because subjects differ in their characteristics, the number of questions in each category is not the same (Figure~\ref{fig:data_distribution}); overall model performance therefore cannot be obtained by directly averaging the values in this table.}
-\caption{\textbf{Deep Research Across Subjects (LLMs)}: Subject-wise scores across ten scientific domains.}
+\caption{\textbf{Deep Research Across Subjects (LLMs)}: Subject-wise scores across ten scientific domains. Note: Because subjects differ in their characteristics, the number of questions in each category is not the same (Figure~\ref{fig:data_distribution}); overall model performance therefore cannot be obtained by directly averaging the values in this table.}
-\caption{\textbf{Deep Research Across Subjects (Agents)}: Subject-wise scores across ten scientific domains.}
+\caption{\textbf{Deep Research Across Subjects (Agents)}: Subject-wise scores across ten scientific domains. Note: Because subjects differ in their characteristics, the number of questions in each category is not the same (Figure~\ref{fig:data_distribution}); overall model performance therefore cannot be obtained by directly averaging the values in this table.}
-\caption{\textbf{Idea Generation Across Subjects}: Subject-wise scores.}
+\caption{\textbf{Idea Generation Across Subjects}: Subject-wise scores. Note: Because subjects differ in their characteristics, the number of questions in each category is not the same (Figure~\ref{fig:data_distribution}); overall model performance therefore cannot be obtained by directly averaging the values in this table.}
-\caption{\textbf{Dry Experiment Across Subjects}: Subject-wise scores.}
+\caption{\textbf{Dry Experiment Across Subjects}: Subject-wise scores. Note: Because subjects differ in their characteristics, the number of questions in each category is not the same (Figure~\ref{fig:data_distribution}); overall model performance therefore cannot be obtained by directly averaging the values in this table.}
-\caption{\textbf{Wet Experiment Across Subjects}: Scores across Action Sequence Similarity (SS) and Parameter Accuracy (PA) categories.}
+\caption{\textbf{Wet Experiment Across Subjects}: Scores across Action Sequence Similarity (SS) and Parameter Accuracy (PA) categories. Note: Because subjects differ in their characteristics, the number of questions in each category is not the same (Figure~\ref{fig:data_distribution}); overall model performance therefore cannot be obtained by directly averaging the values in this table.}
-\caption{\textbf{Experimental Reasoning Across Subjects (Multi-choice Accuracy)}: Subject-wise scores across 10 scientific disciplines.}
+\caption{\textbf{Experimental Reasoning Across Subjects (Multi-choice Accuracy)}: Subject-wise scores across 10 scientific disciplines. Note: Because subjects differ in their characteristics, the number of questions in each category is not the same (Figure~\ref{fig:data_distribution}); overall model performance therefore cannot be obtained by directly averaging the values in this table.}
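The note appended to each caption above makes a quantitative point worth spelling out: because categories contain different numbers of questions, the overall score is the count-weighted mean of the category scores, not their plain average. A small illustration with made-up numbers (category names follow the captions above; the scores and counts are not benchmark results):

```python
# Why per-category scores cannot simply be averaged when categories contain
# different numbers of questions. Scores and counts below are invented for
# illustration only.
category_scores = {"Properties": 0.80, "Micro-Experiments": 0.40}  # per-category accuracy
category_counts = {"Properties": 30, "Micro-Experiments": 10}      # questions per category

unweighted = sum(category_scores.values()) / len(category_scores)
weighted = sum(
    category_scores[c] * category_counts[c] for c in category_scores
) / sum(category_counts.values())

print(f"unweighted mean of category scores: {unweighted:.2f}")  # 0.60
print(f"count-weighted overall score:       {weighted:.2f}")    # 0.70
```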