Skip to content

Commit d233656

Browse files
committed
try asking manager for a fix
Signed-off-by: vsoch <[email protected]>
1 parent a0f0e45 commit d233656

File tree

11 files changed

+151
-163
lines changed

11 files changed

+151
-163
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ See [examples/agent](examples/agent) for an example.
2828

2929
**And experiment ideas**
3030

31+
- How do we define stability?
3132
- What are the increments of change (e.g., "adding a library")? We should be able to keep track of times for each stage and what changed, and an analyzer LLM can look at result and understand (categorize) most salient contributions to change.
3233
- We also can time the time it takes to do subsequent changes, when relevant. For example, if we are building, we should be able to use cached layers (and the build times speed up) if the LLM is changing content later in the Dockerfile.
3334
- We can also save the successful results (Dockerfile builds, for example) and compare for similarity. How consistent is the LLM?

examples/agent/plans/run-lammps.yaml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
name: Build and Deploy LAMMPS
22
description: Build a Docker container and deploy it as a Kubernetes Job.
33
plan:
4+
5+
# Important: everything you want to provide to the manager agent should be defined.
6+
# Agents can pass steps in between, but the manager is always given stateless context.
47
- agent: build
58
context:
69
environment: "google cloud CPU instance in Kubernetes"
710
application: lammps
8-
# Testing max attempts for help from LLM manager
911
max_attempts: 1
1012
details: |
1113
Please build with the reaxff HNS example located in examples/reaxff/HNS.
@@ -18,6 +20,7 @@ plan:
1820
context:
1921
no_pull: true
2022
environment: "google cloud CPU instance in Kubernetes"
23+
max_attempts: 1
2124
details: |
2225
Please execute the reaxff HNS example, and assume the data is in the PWD,
2326
Run lammps with params -v x 2 -v y 2 -v z 2 -in ./in.reaxff.hns

fractale/agent/base.py

Lines changed: 30 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
import fractale.utils as utils
22

3-
43
class Agent:
54
"""
65
A base for an agent. Each agent should:
@@ -36,12 +35,11 @@ def return_on_failure(self):
3635
"""
3736
On failure, have we reached max attempts and should return?
3837
"""
39-
print('CHECK')
40-
import IPython
41-
IPython.embed()
38+
# Unset or 0.
4239
if not self.max_attempts:
4340
return False
44-
return self.attempts > self.max_attempts
41+
# This starts counting at 1, so we check >=
42+
return self.attempts >= self.max_attempts
4543

4644
def set_max_attempts(self, max_attempts):
4745
self.max_attempts = max_attempts
@@ -66,31 +64,34 @@ def write_file(self, context, content, add_comment=True):
6664
content += f"\n# Generated by fractale {self.name} agent"
6765
utils.write_file(content, outfile)
6866

69-
def ask_gemini(self, prompt):
67+
def get_code_block(self, content, code_type):
68+
"""
69+
Parse a code block from the response
70+
"""
71+
if content.startswith(f"```{code_type}"):
72+
content = content[len(f"```{code_type}") :]
73+
if content.startswith("```"):
74+
content = content[len("```") :]
75+
if content.endswith("```"):
76+
content = content[: -len("```")]
77+
return content
78+
79+
def ask_gemini(self, prompt, with_history=True):
7080
"""
7181
Ask gemini adds a wrapper with some error handling.
7282
"""
7383
try:
74-
response = self.chat.send_message(prompt)
84+
if with_history:
85+
response = self.chat.send_message(prompt)
86+
else:
87+
response = self.model.generate_content(prompt)
7588

7689
# This line can fail. If it succeeds, return entire response
77-
text_content = response.text
78-
assert text_content
79-
return response
90+
return response.text.strip()
8091

8192
except ValueError as e:
8293
print(f"[Error] The API response was blocked and contained no text: {str(e)}")
83-
84-
print("VANESSA DEBUG WHAT TO DO")
85-
import IPython
86-
87-
IPython.embed()
88-
# We probably want to retry if it is 1 (STOP) and empty.
89-
# Otherwise we need to somehow retry fixing the dockerfile.
90-
# For robust logging, you can inspect the reason.
91-
if response.candidates:
92-
finish_reason = response.candidates[0].finish_reason.name
93-
print(f"Finish Reason: {finish_reason}")
94+
return "GEMINI ERROR: The API returned an error (or stop) and we need to try again."
9495

9596
def run(self, context):
9697
"""
@@ -99,6 +100,14 @@ def run(self, context):
99100
assert context
100101
raise NotImplementedError(f"The {self.name} agent is missing a 'run' function")
101102

103+
def get_initial_prompt(self, context):
104+
"""
105+
Get the initial prompt (with details) to provide context to the manager.
106+
107+
If we don't do this, the manager can provide a bad instruction for how to fix the error.
108+
"""
109+
return self.get_prompt(context)
110+
102111
def get_prompt(self, context):
103112
"""
104113
This function should take the same context as run and return the parsed prompt that

fractale/agent/build/agent.py

Lines changed: 19 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -41,13 +41,6 @@ def init(self):
4141
model = genai.GenerativeModel("gemini-2.5-pro")
4242
self.chat = model.start_chat()
4343

44-
def requires(self):
45-
"""
46-
Each agent has a requires function to tell the manager what
47-
they do and what is required in the context to run them.
48-
"""
49-
return prompts.requires
50-
5144
def add_arguments(self, subparser):
5245
"""
5346
Add arguments for the plugin to show up in argparse
@@ -125,7 +118,11 @@ def run(self, context):
125118
# This will either generate fresh or rebuild erroneous Dockerfile
126119
# We don't return the dockerfile because it is updated in the context
127120
self.generate_dockerfile(context)
128-
print(Panel(context.dockerfile, title="[green]Dockerfile[/green]", border_style="green"))
121+
print(Panel(context.dockerfile, title="[green]Dockerfile or Response[/green]", border_style="green"))
122+
123+
# Set the container on the context for a next step to use it...
124+
container = context.get("container") or self.generate_name(context.application)
125+
context.container = container
129126

130127
# Build it! We might want to only allow a certain number of retries or incremental changes.
131128
return_code, output = self.build(context)
@@ -170,7 +167,7 @@ def get_result(self, context):
170167
"""
171168
Return either the entire context or single result.
172169
"""
173-
if context.get("managed") is True:
170+
if context.is_managed:
174171
return context
175172
return context.dockerfile
176173

@@ -216,10 +213,6 @@ def build(self, context):
216213
Build the Dockerfile! Yolo!
217214
"""
218215
dockerfile = context.get("dockerfile")
219-
image_name = context.get("container") or self.generate_name(context.application)
220-
221-
# Set the container on the context for follow up steps.
222-
context.container = image_name
223216

224217
# Not sure if this can happen, assume it can
225218
if not dockerfile:
@@ -230,17 +223,20 @@ def build(self, context):
230223

231224
# Write the Dockerfile to the temporary directory
232225
utils.write_file(dockerfile, os.path.join(build_dir, "Dockerfile"))
233-
print(
234-
Panel(
235-
f"Attempt {self.attempts} to build image: [bold cyan]{image_name}[/bold cyan]",
236-
title="[blue]Docker Build[/blue]",
237-
border_style="blue",
226+
227+
# If only one max attempt, don't print here, not important to show.
228+
if self.max_attempts is not None and self.max_attempts > 1:
229+
print(
230+
Panel(
231+
f"Attempt {self.attempts} to build image: [bold cyan]{context.container}[/bold cyan]",
232+
title="[blue]Docker Build[/blue]",
233+
border_style="blue",
234+
)
238235
)
239-
)
240236

241237
# Run the build process using the temporary directory as context
242238
p = subprocess.run(
243-
["docker", "build", "--network", "host", "-t", image_name, "."],
239+
["docker", "build", "--network", "host", "-t", context.container, "."],
244240
capture_output=True,
245241
text=True,
246242
cwd=build_dir,
@@ -259,18 +255,12 @@ def generate_dockerfile(self, context):
259255
print(textwrap.indent(prompt, "> ", predicate=lambda _: True))
260256

261257
# The API can error and not return a response.text.
262-
response = self.ask_gemini(prompt)
258+
content = self.ask_gemini(prompt)
263259
print("Received Dockerfile response from Gemini...")
264260

265261
# Try to remove Dockerfile from code block
266262
try:
267-
content = response.text.strip()
268-
if content.startswith("```dockerfile"):
269-
content = content[len("```dockerfile") :]
270-
if content.startswith("```"):
271-
content = content[len("```") :]
272-
if content.endswith("```"):
273-
content = content[: -len("```")]
263+
content = self.get_code_block(content, 'dockerfile')
274264

275265
# If we are getting commentary...
276266
match = re.search(dockerfile_pattern, content, re.DOTALL)
@@ -283,4 +273,4 @@ def generate_dockerfile(self, context):
283273
context.dockerfile = dockerfile
284274
context.result = dockerfile
285275
except Exception as e:
286-
sys.exit(f"Error parsing response from Gemini: {e}\n{response.text}")
276+
sys.exit(f"Error parsing response from Gemini: {e}\n{content}")

fractale/agent/build/prompts.py

Lines changed: 12 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -25,33 +25,27 @@
2525
- Don't worry about users/permissions - just be root.
2626
"""
2727

28+
29+
# TODO: do we want to add back common instructions here?
2830
rebuild_prompt = (
29-
f"""Act as a Dockerfile builder service expert. I am trying to build a Docker image named for the application '%s' in an environment for '%s'. The previous attempt to build or run the Dockerfile failed. Here is the problematic Dockerfile:
31+
f"""Your previous Dockerfile build has failed. Here is an instruction for how to fix it.
3032
31-
```dockerfile
32-
%s
33-
```
33+
Please analyze the instruction and your previous Dockerfile, and provide a corrected version.
34+
- The response should only contain the complete, corrected Dockerfile inside a single markdown code block.
35+
- Use succinct comments in the Dockerfile to explain build logic and changes.
36+
- Follow the same guidelines as previously instructed.
3437
35-
Here is the error message I received:
36-
```
3738
%s
38-
```
39-
40-
Please analyze the error and the Dockerfile, and provide a corrected version.
41-
- The response should only contain the complete, corrected Dockerfile inside a single markdown code block.
42-
- Use succinct comments in the Dockerfile to explain build logic and changes.
4339
"""
44-
+ common_instructions
4540
)
4641

4742

4843
def get_rebuild_prompt(context):
49-
environment = context.get("environment", defaults.environment)
50-
application = context.get("application", required=True)
51-
return prompt_wrapper(
52-
rebuild_prompt % (application, environment, context.dockerfile, context.error_message),
53-
context=context,
54-
)
44+
"""
45+
The rebuild prompt will either be the entire error output, or the parsed error
46+
output with help from the agent manager.
47+
"""
48+
return prompt_wrapper(rebuild_prompt % context.error_message, context=context)
5549

5650

5751
build_prompt = (

fractale/agent/context.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,15 @@ def reset(self):
3232
"""
3333
Reset the return code and result.
3434
"""
35-
for key in ["return_code", "result"]:
35+
for key in ["return_code", "result", "error_message"]:
3636
self.data[key] = None
3737

38+
def is_managed(self):
39+
"""
40+
Is the context being managed?
41+
"""
42+
return self.get("managed") is True
43+
3844
def __getattribute__(self, name):
3945
"""
4046
Intercepts all attribute lookups (including methods/functions)

fractale/agent/kubernetes_job/agent.py

Lines changed: 21 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414
from rich.panel import Panel
1515
from rich.syntax import Syntax
1616

17-
import fractale.agent.defaults as defaults
1817
import fractale.agent.kubernetes_job.prompts as prompts
1918
import fractale.utils as utils
2019
from fractale.agent.base import Agent
@@ -77,13 +76,6 @@ def init(self):
7776
model = genai.GenerativeModel("gemini-2.5-pro")
7877
self.chat = model.start_chat()
7978

80-
def requires(self):
81-
"""
82-
Each agent has a requires function to tell the manager what
83-
they do and what is required in the context to run them.
84-
"""
85-
return prompts.requires
86-
8779
def get_prompt(self, context):
8880
"""
8981
Get the prompt for the LLM. We expose this so the manager can take it
@@ -144,16 +136,31 @@ def run(self, context):
144136
print("\n[bold cyan] Requesting Correction from Kubernetes Job Agent[/bold cyan]")
145137
self.attempts += 1
146138

139+
# Return early based on max attempts
140+
if self.return_on_failure():
141+
context.return_code = -1
142+
context.result = output
143+
return self.get_result(context)
144+
147145
# Trigger again, provide initial context and error message
146+
# This is the internal loop running, no manager agent
148147
context.error_message = output
149148
context.job_crd = job_crd
150149
return self.run(context)
151150

152151
self.write_file(context, job_crd)
153152
self.print_crd(job_crd)
154-
if context.get("managed") is True:
153+
return self.get_result(context)
154+
155+
156+
def get_result(self, context):
157+
"""
158+
Return either the entire context or single result.
159+
"""
160+
if context.is_managed:
155161
return context
156-
return job_crd
162+
return context.job_crd
163+
157164

158165
def print_crd(self, job_crd):
159166
"""
@@ -424,26 +431,20 @@ def deploy(self, job_crd, image_name, cleanup=True):
424431
shutil.rmtree(deploy_dir, ignore_errors=True)
425432
return (0, "Success")
426433

427-
def generate_crd(self, context, template=None):
434+
def generate_crd(self, context):
428435
"""
429436
Generates or refines an existing Job CRD using the Gemini API.
430437
"""
431438
prompt = self.get_prompt(context)
432439
print("Sending generation prompt to Gemini...")
433440
print(textwrap.indent(prompt, "> ", predicate=lambda _: True))
434441

435-
response = self.ask_gemini(prompt)
442+
content = self.ask_gemini(prompt)
436443
print("Received response from Gemini...")
437444

438445
# Try to remove Dockerfile from code block
439446
try:
440-
content = response.text.strip()
441-
if content.startswith("```yaml"):
442-
content = content[len("```yaml") :]
443-
if content.startswith("```"):
444-
content = content[len("```") :]
445-
if content.endswith("```"):
446-
content = content[: -len("```")]
447+
content = self.get_code_block(content, 'yaml')
447448

448449
# If we are getting commentary...
449450
match = re.search(yaml_pattern, content, re.DOTALL)
@@ -455,4 +456,4 @@ def generate_crd(self, context, template=None):
455456
return job_crd
456457

457458
except Exception as e:
458-
sys.exit(f"Error parsing response from Gemini: {e}\n{response.text}")
459+
sys.exit(f"Error parsing response from Gemini: {e}\n{content}")

0 commit comments

Comments
 (0)