Changes from 5 commits
rdagent/core/evolving_agent.py: 9 changes (8 additions & 1 deletion)
@@ -79,10 +79,17 @@ def multistep_evolve(
# TODO: Putting the evolving trace in here doesn't actually work
queried_knowledge = self.rag.query(evo, self.evolving_trace)

# 2.5 Brief evolving history
evolving_history = (
Reviewer comment (Contributor): I think we should defer rendering this to the last step.

len(self.evolving_trace) + 1,
"\n".join(f"### Evolving Step {i + 1}\n{trace}" for i, trace in enumerate(self.evolving_trace)),
)

# 3. evolve
evo = self.evolving_strategy.evolve(
evo=evo,
evolving_trace=self.evolving_trace,
evolving_history=evolving_history,
queried_knowledge=queried_knowledge,
)

@@ -92,7 +99,7 @@ def multistep_evolve(
# 5. Evaluation
if self.with_feedback:
es.feedback = (
eva if isinstance(eva, Feedback) else eva.evaluate(evo, queried_knowledge=queried_knowledge)
eva if isinstance(eva, Feedback) else eva.evaluate(evo, queried_knowledge=queried_knowledge, evolving_history=evolving_history)
)
logger.log_object(es.feedback, tag="evolving feedback")
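To make the new argument concrete, the sketch below shows the shape of the `evolving_history` tuple built above: the index of the step about to run, plus a rendered text of all previous steps. The trace entries are illustrative stand-ins for the `EvoStep` objects in `self.evolving_trace`, not values from a real run.

```python
# Minimal sketch (illustrative values): evolving_history is
# (index_of_the_step_about_to_run, rendered_text_of_previous_steps).
evolving_trace = [
    "feedback of step 1\n### Summary of Code Change\nChanged epochs from 5 to 20",
    "feedback of step 2\n### Summary of Code Change\nEnabled early stopping",
]  # stand-ins for EvoStep objects, which render through EvoStep.__str__

evolving_history = (
    len(evolving_trace) + 1,  # e.g. 3: the step that is about to run
    "\n".join(f"### Evolving Step {i + 1}\n{trace}" for i, trace in enumerate(evolving_trace)),
)
```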

rdagent/core/evolving_framework.py: 7 changes (7 additions & 0 deletions)
@@ -51,6 +51,13 @@ class EvoStep:
evolvable_subjects: EvolvableSubjects
queried_knowledge: QueriedKnowledge | None = None
feedback: Feedback | None = None
code_change_summary: str | None = None # TODO: minrui
Reviewer comment (Contributor): I think this is a good place to add the summary; let's have the discussion later.


def __str__(self) -> str:
return f"""{str(self.feedback)}
### Summary of Code Change
{self.code_change_summary}
"""


class EvolvingStrategy(ABC):
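A small, self-contained mimic of the `__str__` just added to `EvoStep` (assuming the f-string below mirrors the method's template exactly): a step whose `code_change_summary` is still unset renders the literal string "None" under the heading, which is worth keeping in mind once the trace is rendered into prompts.

```python
# Self-contained mimic of EvoStep.__str__ (assumed to match the template above):
# a step without a code_change_summary renders the literal "None".
feedback_text = "This implementation is SUCCESS."  # stand-in for str(self.feedback)
code_change_summary = None                          # default value of the new field

rendered = f"""{feedback_text}
### Summary of Code Change
{code_change_summary}
"""
print(rendered)
# This implementation is SUCCESS.
# ### Summary of Code Change
# None
```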
rdagent/scenarios/data_science/dev/runner/__init__.py: 32 changes (17 additions & 15 deletions)
@@ -44,6 +44,7 @@ def implement_one_task(
self,
target_task: CoSTEERTask,
queried_knowledge: CoSTEERQueriedKnowledge | None = None,
evolving_history: tuple = None,
workspace: FBWorkspace | None = None,
prev_task_feedback: CoSTEERSingleFeedback | None = None,
) -> dict[str, str]:
@@ -52,26 +53,28 @@
# if no prev_task_feedback, it is the first loop; we do not make any changes and goto evaluators directly.
return {}

# Output Agent Map
output_map = {
True: (PythonBatchPatchOut.get_spec(), PythonBatchPatchOut.extract_output),
False: (
PythonBatchEditOut.get_spec(with_del=False),
PythonBatchEditOut.extract_output,
),
}
output_spec, extract_output_fn = output_map[self.settings.diff_mode]
# Set output agent
if self.settings.diff_mode:
output_spec = PythonBatchPatchOut.get_spec()
extract_output_fn = PythonBatchPatchOut.extract_output
else:
output_spec = PythonBatchEditOut.get_spec(with_del=False)
extract_output_fn = PythonBatchEditOut.extract_output

if prev_task_feedback.hyperparameter_tuning_decision:
# Use system_refine for hyperparameter tuning
system_prompt = T(".prompts:DSCoSTEER.system_refine").r(
max_loop=DS_RD_SETTING.runner_max_loop,
cur_loop=evolving_history[0],
out_spec=output_spec,
diff_mode=self.settings.diff_mode,
)
else:
task_information_str = target_task.get_task_information()
# Use system_debugger for error fixing and debugging
system_prompt = T(".prompts:DSCoSTEER.system_refine").r(
max_loop=DS_RD_SETTING.runner_max_loop,
cur_loop=evolving_history[0],
task_desc=task_information_str,
out_spec=output_spec,
diff_mode=self.settings.diff_mode,
@@ -81,15 +84,14 @@
user_prompt = T(".prompts:DSCoSTEER.user").r(
code=workspace.all_codes,
feedback=prev_task_feedback,
evolving_history=evolving_history[1],
hyperparameter_tuning_suggestion=prev_task_feedback.hyperparameter_tuning_suggestion,
)

batch_edit = extract_output_fn(
APIBackend().build_messages_and_create_chat_completion(
user_prompt=user_prompt,
system_prompt=system_prompt,
)
resp = APIBackend().build_messages_and_create_chat_completion(
user_prompt=user_prompt,
system_prompt=system_prompt,
)
batch_edit = extract_output_fn(resp["code"])

batch_edit = {k: v for k, v in batch_edit.items() if k in workspace.file_dict.keys()}
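One behaviour worth noting from the filtering line above: edits returned for files that do not already exist in the workspace are silently dropped. A tiny self-contained sketch with hypothetical file names:

```python
# Hypothetical file names; mirrors the filtering line above.
workspace_files = {"train.py": "...", "model.py": "..."}        # stand-in for workspace.file_dict
batch_edit = {"train.py": "# new code", "helper.py": "# new"}   # "helper.py" is not in the workspace

batch_edit = {k: v for k, v in batch_edit.items() if k in workspace_files.keys()}
print(batch_edit)  # {'train.py': '# new code'}
```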

rdagent/scenarios/data_science/dev/runner/eval.py: 22 changes (21 additions & 1 deletion)
@@ -11,7 +11,7 @@
)
from rdagent.components.coder.data_science.conf import get_clear_ws_cmd, get_ds_env
from rdagent.components.coder.data_science.utils import remove_eda_part
from rdagent.core.evolving_framework import QueriedKnowledge
from rdagent.core.evolving_framework import EvoStep, QueriedKnowledge
from rdagent.core.experiment import FBWorkspace, Task
from rdagent.log import rdagent_logger as logger
from rdagent.scenarios.data_science.test_eval import (
@@ -39,6 +39,22 @@ def __init__(
self.hyperparameter_tuning_decision = hyperparameter_tuning_decision
self.hyperparameter_tuning_suggestion = hyperparameter_tuning_suggestion

def __str__(self) -> str:
parts = [
"### Execution",
str(self.execution),
"### Return Check",
self.return_checking if self.return_checking is not None else "No return checking",
"### Code",
str(self.code),
"### Final Decision",
f"This implementation is {'SUCCESS' if self.final_decision else 'FAIL'}.",
]
if self.hyperparameter_tuning_decision:
parts.append("### Hyperparameter Tuning Suggestion")
parts.append(str(self.hyperparameter_tuning_suggestion))
return "\n".join(parts)
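As a rough illustration of the new `__str__`, the snippet below renders one feedback object. The constructor arguments are assumptions inferred from the fields used above (the real `CoSTEERSingleFeedback` signature may differ), and the values are invented.

```python
from rdagent.scenarios.data_science.dev.runner.eval import DSCoSTEEREvalFeedback

# Assumed keyword arguments; values are invented for illustration only.
fb = DSCoSTEEREvalFeedback(
    execution="Ran to completion in 812.40 seconds.",
    return_checking="Submission is valid.",
    code="No obvious correctness issues.",
    final_decision=False,
    hyperparameter_tuning_decision=True,
    hyperparameter_tuning_suggestion="Increase n_estimators from 200 to 800; early stopping is already enabled.",
)
print(str(fb))
# ### Execution
# Ran to completion in 812.40 seconds.
# ### Return Check
# Submission is valid.
# ### Code
# No obvious correctness issues.
# ### Final Decision
# This implementation is FAIL.
# ### Hyperparameter Tuning Suggestion
# Increase n_estimators from 200 to 800; early stopping is already enabled.
```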


class DSCoSTEERCoSTEEREvaluator(CoSTEEREvaluator):

@@ -48,6 +64,7 @@ def evaluate(
implementation: FBWorkspace,
gt_implementation: FBWorkspace,
queried_knowledge: QueriedKnowledge = None,
evolving_history: tuple = None,
Reviewer comment (Contributor): evolving_trace

**kwargs,
) -> DSCoSTEEREvalFeedback:

@@ -130,6 +147,8 @@ def evaluate(
stdout += f"\nSubmission check:\n{submission_check_out}\nIf Submission check returns a 'Submission is valid' or similar message, despite some warning messages, you should still consider the submission as valid and give a positive final decision. "

system_prompt = T(".prompts:DSCoSTEER_eval.system").r(
max_loop=DS_RD_SETTING.runner_max_loop,
cur_loop=evolving_history[0],
scenario=self.scen.get_scenario_all_desc(eda_output=implementation.file_dict.get("EDA.md", None)),
is_sub_enabled=test_eval.is_sub_enabled(self.scen.competition),
task_desc=target_task.get_task_information(),
@@ -141,6 +160,7 @@ def evaluate(
time_spent=f"{implementation.running_info.running_time:.2f} seconds",
timeout=f"{env.conf.running_timeout_period} seconds",
percent_of_timeout_used=f"{(implementation.running_info.running_time / env.conf.running_timeout_period) * 100:.2f}%",
evolving_history=evolving_history[1],
)

feedback = build_cls_from_json_with_retry(
rdagent/scenarios/data_science/dev/runner/prompts.yaml: 101 changes (62 additions & 39 deletions)
@@ -2,21 +2,25 @@ DSCoSTEER_eval:
system: |-
{% include "scenarios.data_science.share:scen.role" %}
{% if is_sub_enabled %}
You have successfully implemented the workflow on a sampled dataset and are now transitioning to the full dataset.
The code base will be iteratively improved through a series of [coding] and [feedback] steps.
The maximum number of evolution steps is {{ max_loop }}, and you are currently on [feedback] step {{ cur_loop }}.

You will be provided with:
1. `Code base`: The code base of the solution
2. `The stdout of code execution and testing`: The generated stdout when executing the code base and corresponding testing
3, `The time spent on code execution`: The time spent on the code execution
4. `The timeout of code execution`: the time limitation of the code execution
5. `The percent of timeout used`: the percentage of the time limitation used
Your task is to perform the following evaluation(s):

# Evalution 1: Code Correctness
1. The current code base you need to evaluate.
2. The stdout of the current code execution and testing.
3. The time spent on the current code execution, along with the total timeout and the percent of timeout used for current code execution.
4. The evolving history, which includes summaries of previous [coding] and [feedback] steps.

Your task is to perform the following evaluations:

# Evaluation 1: Code Correctness
## Scenario
The code is focusing on the following scenario:
{{ scenario }}

## Target Task Description
The code is focusing on the following task
The code is targeting on the following task
{{ task_desc }}

## Runtime Environment
@@ -39,12 +43,13 @@ DSCoSTEER_eval:
The user will provide you the time spent on the whole code execution and the timeout of the code execution. You should decide whether the hyperparameter is reasonable based on the time.
For example, if the code uses only a very small portion of the allowed time, and hyperparameters like `n_estimators` or `epochs` have low values, with early stopping not being triggered and possible signs of underfitting, you should suggest increasing these hyperparameters.
You should also notice other resources utilization hyper-parameters,
For example, if you are using a GPU with large memory, and the batch size is set very low, you should suggest increasing the batch size if it is not reasonable.
For example, if you are using a GPU with large memory, and the batch size is set very low, you should suggest increasing the batch size if it is not reasonable.

## Evaluation Guidelines
1. The code execution time or resource utilization suggest that there is room for improvement in the hyperparameters.
2. The code must apply early stopping strategy already (in order to prevent overfitting).
3. Your suggestion should have a strong chance of improving the model's performance. Focus on the most obvious and impactful opportunities for quick improvement by leveraging more training time. Don't explore hyperparameters with low confidence. If there are no obvious and impactful opportunities and the code runs well, please accept it.
2. The code must apply early stopping strategy already (in order to prevent overfitting).
3. Carefully review the entire evolving history to avoid repeating the same mistakes.
4. Your suggestion should have a strong chance of improving the model's performance. Focus on the most obvious and impactful opportunities for quick improvement by leveraging more training time. Don't explore hyperparameters with low confidence. If there are no obvious and impactful opportunities and the code runs well, please accept it.
If the code satisfies the requirements:
- Set "hyperparameter_tuning_decision" to true.
- Set "final_decision" to false.
@@ -81,7 +86,6 @@ DSCoSTEER_eval:
}
```
{% endif %}
# NOTE: when is_sub_enabled == False, we don't have any checking about the return. So it is just placeholder currently

user: |-
# Code base
@@ -90,27 +94,32 @@ DSCoSTEER_eval:
## Stdout of code execution and testing
{{ stdout }}

# The time spend on code execution and timeout
{{ time_spent }}
## Execution time and timeout
The execution time for the current code base: {{ time_spent }}.
The total timeout: {{ timeout }}.
The percent of timeout used: {{ percent_of_timeout_used }}.

## The timeout of code execution
{{ timeout }}

## The percent of timeout used
{{ percent_of_timeout_used }}
## Evolving History
{{ evolving_history }}


DSCoSTEER:
system_debugger: |-
{% include "scenarios.data_science.share:scen.role" %}
You have finished the implementation of the whole workflow which has executed well on a sampled dataset. Now we are working on the full dataset.
The user has reported that the workflow failed to execute on the full dataset.
Your will be provided with:
1. Code base.
2. Task description, which is the task the code is trying to solve.
3. Feedback generated during the execution of the whole workflow.
4. Suggestions for hyperparameter tuning.
Your job is to debug the whole code base, try to correct the errors, and ensure that the workflow can execute successfully on the full dataset.
You have successfully implemented the workflow on a sampled dataset and are now transitioning to the full dataset.
The code base will be iteratively improved through a series of [coding] and [feedback] steps.
The maximum number of evolution steps is {{ max_loop }}, and you are currently on [coding] step {{ cur_loop }}.
The previous [feedback] step indicates that the code failed to execute successfully on the full dataset.

You will be provided with:
1. The current code base you need to refine.
2. The task description, which is the task the code is trying to solve.
3. The feedback after executing the code base.
4. The evolving history, which includes summaries of previous [coding] and [feedback] steps.

Your job is to:
1. Debug the whole code base, try to correct the errors, and ensure that the workflow can execute successfully on the full dataset.
2. Summarize the changes you made to the original code base.
## Task description
{{ task_desc }}

@@ -121,42 +130,53 @@ DSCoSTEER:
{% else %}
2. You must output the COMPLETE and FULL code. Do not truncate, summarize, or omit any parts of the code. Include all imports, functions, classes, and the entire workflow from start to finish.
{% endif %}
3. Write a concise, structured code change summary. State exactly what was changed from what to what (e.g., "Changed batch_size from 32 to 128"), and briefly explain the reasoning behind each modification.

## Output Format
{% if out_spec %}
{{ out_spec }}
{% else %}
Please respond with the code in the following JSON format and nothing else.
{
"code": "The Python code as a string."
"code": "The refined Python code as a string."
"code_change_summary": "The structured summary to briefly summarize the changes made to the original code base in two to three sentences."
Reviewer comment (Contributor): Remember to change this.

}
{% endif %}

system_refine: |-
{% include "scenarios.data_science.share:scen.role" %}
You have finished the implementation of the whole workflow which has executed well on a sampled dataset. Now we are working on the full dataset.
The user has reported that the hyperparameters are not reasonable and the code didn't make the best use of the time limit.
You have successfully implemented the workflow on a sampled dataset and are now transitioning to the full dataset.
The code base will be iteratively improved through a series of [coding] and [feedback] steps.
The maximum number of evolution steps is {{ max_loop }}, and you are currently on [coding] step {{ cur_loop }}.
The previous [feedback] step indicates that the code executed successfully, but there are opportunities to improve performance through hyperparameter tuning.

You will be provided with:
1. Code base.
2. Feedback generated during the execution of the whole workflow.
3. Suggestions for hyperparameter tuning.
Your task is to refine the code base and modify the hyperparameters based on the feedback and suggestions.
1. The current code base you need to refine.
2. The feedback after executing the code base.
3. The suggestions for hyperparameter tuning.
4. The evolving history, which includes summaries of previous [coding] and [feedback] steps.

Your task is to:
1. Refine the code base and modify the hyperparameters based on the feedback, suggestions, and evolving history.
2. Summarize the changes you made to the original code base.

## Instructions
1. Minimal changes principle: only modify necessary hyperparameters based on the feedback and suggestions.
1. Minimal changes principle: only modify necessary hyperparameters based on the feedback, suggestions, and evolving history.
{% if diff_mode %}
2. You must output in Code Diff format. The detailed format specification is as follows.
2. You must output the code in V4A diff format. The detailed format specification is as follows.
{% else %}
2. You must output the COMPLETE and FULL code. Do not truncate, summarize, or omit any parts of the code. Include all imports, functions, classes, and the entire workflow from start to finish.
{% endif %}
3. Write a concise, structured code change summary. State exactly what was changed from what to what (e.g., "Changed batch_size from 32 to 128"), and briefly explain the reasoning behind each modification.

## Output Format
{% if out_spec %}
{{ out_spec }}
{% else %}
Please respond with the code in the following JSON format and nothing else.
{
"code": "The Python code as a string."
"code": "The refined Python code as a string."
"code_change_summary": "The structured summary to briefly summarize the changes made to the original code base in two to three sentences."
}
{% endif %}

@@ -167,7 +187,10 @@ DSCoSTEER:
## Feedback
{{ feedback }}

## Evolving History
{{ evolving_history }}

{% if hyperparameter_tuning_suggestion is not none %}
## Hyperparameter Tuning Suggestion
{{ hyperparameter_tuning_suggestion }}
{% endif %}
{% endif %}
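For reference, here is a hedged example of the reply shape the refine and debugger prompts now request when diff_mode is disabled; the values are invented, and the JSON skeletons in the prompts above remain the authoritative spec.

```python
# Invented values; illustrates the non-diff-mode reply with the new
# code_change_summary field alongside the full script under "code".
expected_reply = {
    "code": "import pandas as pd\n# ... the complete, runnable training script ...",
    "code_change_summary": (
        "Changed epochs from 5 to 30 and n_estimators from 200 to 800; "
        "early stopping was already enabled, so the extra budget mainly extends training."
    ),
}
```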