Commit c0ec067

vsoch committed
wip: new results and optimize agent
Signed-off-by: vsoch <[email protected]>
1 parent 85ecd28 commit c0ec067

13 files changed: +442, −26 lines

fractale/agent/base.py

Lines changed: 3 additions & 1 deletion
@@ -49,7 +49,7 @@ def __init__(
         self.init()
 
     def init_metadata(self):
-        self.metadata = {"times": {}, "assets": {}, "ask_gemini": [], "retries": 0, "failures": []}
+        self.metadata = {"times": {}, "assets": {}, "retries": 0, "failures": []}
 
     @save_result
     def run(self, context):
@@ -294,6 +294,8 @@ def save_gemini_metadata(self, elapsed_time, response, with_history):
         """
         Save gemini response metadata and elapsed time
         """
+        if "ask_gemini" not in self.metadata:
+            self.metadata["ask_gemini"] = []
         self.metadata["ask_gemini"].append(
             {
                 "conversation_history": with_history,

fractale/agent/kubernetes/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -1,2 +1,3 @@
 from .job import KubernetesJobAgent
-assert KubernetesJobAgent
+
+assert KubernetesJobAgent

fractale/agent/kubernetes/base.py

Lines changed: 60 additions & 2 deletions
@@ -1,4 +1,9 @@
 import argparse
+import json
+import subprocess
+
+from rich import print
+from rich.panel import Panel
 from rich.syntax import Syntax
 
 import fractale.agent.logger as logger
@@ -45,7 +50,6 @@ def print_result(self, job_crd):
             highlighted_syntax, title="Final Kubernetes Job", border_style="green", expand=True
         )
 
-
     def save_log(self, full_logs):
         """
         Save logs to metadata
@@ -64,4 +68,58 @@ def save_job_manifest(self, job):
             self.metadata["assets"][self.result_type] = []
         self.metadata["assets"][self.result_type].append(
             {"item": job, "attempt": self.attempts}
-        )
+        )
+
+    def cluster_resources(self):
+        """
+        Get cluster resources - count of nodes and resources.
+
+        I was thinking of caching this, but clusters can change,
+        and it's easy (and inexpensive) enough to query that we repeat.
+        """
+        print("[yellow]Querying Kubernetes cluster for node resources...[/yellow]")
+        try:
+            # Execute the kubectl command
+            result = subprocess.run(
+                ["kubectl", "get", "nodes", "-o", "json"],
+                capture_output=True,
+                text=True,
+                check=True,
+                timeout=30,
+            )
+
+            # Parse the JSON output
+            nodes_data = json.loads(result.stdout)
+            nodes = nodes_data.get("items", [])
+
+            if not nodes:
+                print("[red]Error: No nodes found in the cluster.[/red]")
+                return None
+
+            # Keep a listing (with count) of node specs.
+            # The key is the cpu, memory, and arch; the value is the node count.
+            node_specs = {}
+            for node in nodes:
+                node_spec = (
+                    node["status"]["allocatable"]["cpu"],
+                    node["status"]["allocatable"]["memory"],
+                    node["status"]["nodeInfo"]["architecture"],
+                )
+                if node_spec not in node_specs:
+                    node_specs[node_spec] = 0
+                node_specs[node_spec] += 1
+
+            # Ensure we expand the resources
+            node_specs = [
+                {"cpu": x[0], "memory": x[1], "arch": x[2], "count": v}
+                for x, v in node_specs.items()
+            ]
+            cluster_info = {"total_nodes": len(nodes), "node_specs": node_specs}
+
+            print("[green]✅ Successfully retrieved cluster information.[/green]")
+            return cluster_info
+
+        except Exception as e:
+            print(
+                "[bold red]Error executing kubectl command. Do you have access to the cluster?[/bold red]"
+            )
+            # Not every exception carries stderr (e.g., JSONDecodeError), so guard.
+            print(f"Stderr: {getattr(e, 'stderr', e)}")

fractale/agent/kubernetes/job/agent.py

Lines changed: 72 additions & 16 deletions
@@ -1,7 +1,5 @@
-import argparse
 import json
 import os
-import re
 import shutil
 import subprocess
 import sys
@@ -11,17 +9,16 @@
 
 import yaml
 from rich import print
-from rich.syntax import Syntax
 
-import fractale.agent.kubernetes.objects as objects
-from fractale.agent.kubernetes.base import KubernetesAgent
 import fractale.agent.kubernetes.job.prompts as prompts
+import fractale.agent.kubernetes.objects as objects
 import fractale.agent.logger as logger
 import fractale.utils as utils
-from fractale.agent.base import GeminiAgent
 from fractale.agent.context import get_context
 from fractale.agent.decorators import timed
 from fractale.agent.errors import DebugAgent
+from fractale.agent.kubernetes.base import KubernetesAgent
+from fractale.agent.optimize import OptimizationAgent
 
 
 class KubernetesJobAgent(KubernetesAgent):
@@ -33,6 +30,13 @@ class KubernetesJobAgent(KubernetesAgent):
     description = "Kubernetes Job agent"
     result_type = "kubernetes-job-manifest"
 
+    def __init__(self, *args, **kwargs):
+        """
+        Add the optimization agent, even if we don't need it.
+        """
+        super().__init__(*args, **kwargs)
+        self.optimize_agent = OptimizationAgent()
+
     def get_prompt(self, context):
         """
         Get the prompt for the LLM. We expose this so the manager can take it
@@ -115,15 +119,19 @@ def get_diagnostics(self, job, pod):
         Helper to collect rich error data for a failed job.
         """
         print("[yellow]Gathering diagnostics for failed job...[/yellow]")
-        pod_status = pod.get_filtered_status()
+        pod_events = []
+        pods_description = ""
+        if pod is not None:
+            pod_status = pod.get_filtered_status()
+            pod_events = pod.get_events()
+            pods_description = json.dumps(pod_status)
+
        job_status = job.get_filtered_status()
 
         # Use json.dumps because it's more compact (maybe fewer tokens)
-        pod_events = pod.get_events()
         job_events = job.get_events()
         events = sorted(job_events + pod_events, key=lambda e: e.get("lastTimestamp", ""))
         job_description = json.dumps(job_status)
-        pods_description = json.dumps(pod_status)
         events_description = json.dumps(events)
         full_logs = job.get_logs()
@@ -139,7 +147,6 @@ def deploy(self, context):
         Deploy the Kubernetes Job.
         """
         job_crd = context.result
-        cleanup = context.get("cleanup", True)
 
         # Not sure if this can happen, assume it can
         if not job_crd:
@@ -189,6 +196,12 @@ def deploy(self, context):
         job_data["spec"]["template"]["spec"]["containers"][0]["command"] = ["sleep", "infinity"]
         job_crd = yaml.dump(job_data)
 
+        # Create job objects (and eventually pod),
+        # but ensure we delete any that might exist from before.
+        job = objects.KubernetesJob(job_name, namespace)
+        job.delete()
+        pod = None
+
         # Write the manifest to a temporary directory
         job_manifest_path = os.path.join(deploy_dir, "job.yaml")
         utils.write_file(job_crd, job_manifest_path)
@@ -218,10 +231,6 @@ def deploy(self, context):
         # 2. We then need to wait until the job is running or fails
         print("[yellow]Waiting for Job to start... (Timeout: 5 minutes)[/yellow]")
 
-        # Create job objects (and eventually pod)
-        job = objects.KubernetesJob(job_name, namespace)
-        pod = None
-
         # This assumes a backoff / retry of 1, so we aren't doing recreation.
         # If it fails once, it fails once and for all.
         # 30 * 5s = 150s (2.5 minutes!)
@@ -245,7 +254,12 @@ def deploy(self, context):
             )
 
             # 2. If the job isn't terminal, find the pod. It may not exist yet.
-            pod = pod or job.get_pod_name()
+            tries = 0
+            while not pod and tries < 10:
+                print("Waiting for pod...")
+                pod = job.get_pod()
+                time.sleep(5)
+                tries += 1
 
             # 3. If a pod exists, inspect it deeply for fatal errors or readiness.
             if pod:
@@ -320,21 +334,63 @@ def deploy(self, context):
         # But did it succeed?
         if final_status.get("succeeded", 0) > 0:
             print("\n[green]✅ Job final status is Succeeded.[/green]")
+
+            # If we want to optimize, we continue to run until instructed not to.
+            if context.get("optimize") is not None:
+
+                # TODO move into own function?
+                # We should provide the cluster resources to the agent
+                resources = self.cluster_resources()
+
+                # The agent calling the optimize agent decides what metadata to present.
+                # This is how this agent will work for cloud vs. bare metal.
+                context.requires = prompts.get_optimize_prompt(context, resources)
+                context = self.optimize_agent.run(context, full_logs)
+
+                # Go through the spec and update fields that match.
+                decision = context.optimize_result["decision"]
+                print(f"\n[green]✅ Optimization agent decided to {decision}.[/green]")
+                if decision == "RETRY":
+
+                    # Retry means recreating the job
+                    job.delete()
+                    context.result = self.update_job_crd(context.optimize_result, job_crd)
+                    print(context.result)
+                    return self.deploy(context)
+
+                # The agent has decided to return - no more optimization.
+                # TODO: we need to ensure regex can be passed from context (and input)
+                # Here we add the optimization agent metadata to this agent for saving.
+                self.optimize_agent.metadata["foms"] = self.optimize_agent.foms
+                self.metadata["assets"]["optimize"] = self.optimize_agent.metadata
+                return 0, full_logs
+
         else:
             print("\n[red]❌ Job final status is Failed.[/red]")
             diagnostics = self.get_diagnostics(job, pod)
             job.delete()
             # We already have the logs, so we can pass them directly.
             return 1, prompts.failure_message % diagnostics
 
-        if cleanup and os.path.exists(deploy_dir):
+        if context.get("cleanup") is True and os.path.exists(deploy_dir):
             print(f"[dim]Cleaning up temporary deploy directory: {deploy_dir}[/dim]")
             job.delete()
             shutil.rmtree(deploy_dir, ignore_errors=True)
 
         # Save full logs for the step
         return 0, full_logs
 
+    def update_job_crd(self, updates, job_crd):
+        """
+        Update the job crd with a set of controlled fields.
+        """
+        for key in ["decision", "reason"]:
+            if key in updates:
+                del updates[key]
+        # Note: update_prompt expects the updates first, then the manifest.
+        prompt = prompts.update_prompt % (json.dumps(updates), job_crd)
+        result = self.ask_gemini(prompt)
+        return self.get_code_block(result, "yaml")
+
     def save_job_manifest(self, job):
         """
         Save job manifest to metadata
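To make the optimization loop concrete: below is a hypothetical `optimize_result` that would trigger the RETRY branch above. The field names beyond `decision` and `reason` are illustrative; per `update_job_crd`, everything except those two keys is handed to the LLM as Job-shaped updates.

```python
# Hypothetical payload from the optimization agent (illustrative fields).
optimize_result = {
    "decision": "RETRY",
    "reason": "Pods used well under half of allocatable CPU; request a larger slice.",
    "spec": {
        "template": {
            "spec": {
                "containers": [
                    {"resources": {"requests": {"cpu": "7500m", "memory": "28Gi"}}}
                ]
            }
        }
    },
}
```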

fractale/agent/kubernetes/job/prompts.py

Lines changed: 42 additions & 0 deletions
@@ -1,3 +1,5 @@
+import json
+
 import fractale.agent.defaults as defaults
 from fractale.agent.prompts import prompt_wrapper
 
@@ -29,6 +31,46 @@
 %s
 """
 
+update_prompt = """You are a Kubernetes Job update agent. Your job is to take a spec of updates for a Job manifest and apply them.
+You are NOT allowed to make other changes to the manifest. Ignore the 'decision' field and, if you think it appropriate, add context from "reason" as comments.
+Here are the updates:
+
+%s
+
+And here is the Job manifest to apply them to:
+%s
+Return ONLY the YAML with no other text or commentary.
+"""
+
+
+def get_optimize_prompt(context, resources):
+    """
+    Get a description of cluster resources and optimization goals.
+    """
+    prompt = """
+Your task is to optimize the running of a Kubernetes Job: %s in %s. You are allowed to request anywhere in the range of available resources, including count and type. Here are the available resources:
+%s
+Here is the current job manifest:
+```yaml
+%s
+```
+Please return ONLY a json structure to be loaded that includes a limited set of fields (with keys organized the same as a Kubernetes Job, e.g., spec -> template -> spec).
+The result should be provided as json. The fields should map 1:1 into a pod spec serialized as json.
+Do not make requests that lead to Guaranteed pods. DO NOT CHANGE PROBLEM SIZE PARAMETERS OR COMMAND. You can change args. Remember that
+to get a full node's resources you often have to ask for slightly less than what is available.
+""" % (
+        context.optimize,
+        context.environment,
+        json.dumps(resources),
+        context.result,
+    )
+    dockerfile = context.get("dockerfile")
+    if dockerfile:
+        prompt += (
+            f" Here is the Dockerfile that helped to generate the application.\n {dockerfile}\n"
+        )
+    return prompt
+
 
 def get_regenerate_prompt(context):
     """

fractale/agent/kubernetes/objects.py

Lines changed: 1 addition & 1 deletion
@@ -266,7 +266,7 @@ def get_filtered_status(self):
             ],
         }
 
-    def get_pod_name(self):
+    def get_pod(self):
         """
         Find the name of the pod created by a specific job.
         """

fractale/agent/manager/agent.py

Lines changed: 4 additions & 5 deletions
@@ -59,18 +59,17 @@ def get_recovery_step(self, context, failed_step, plan):
         )
         return step
 
-    def save_results(self, tracker):
+    def save_results(self, tracker, plan):
         """
         Save results to file based on timestamp.
-
-        Just ploop into pwd for now, we can eventually take a path.
         """
         if not os.path.exists(self.results_dir):
             os.makedirs(self.results_dir)
         now = datetime.now()
         timestamp = now.strftime("%Y-%m-%d_%H-%M-%S")
         results_file = os.path.join(self.results_dir, f"results-{timestamp}.json")
-        utils.write_json(tracker, results_file)
+        result = {"steps": tracker, "manager": plan.plan}
+        utils.write_json(result, results_file)
 
     @timed
     def run(self, context):
@@ -113,7 +112,7 @@ def run(self, context):
                 f"Agentic tasks complete: [bold magenta]{len(tracker)} agent runs[/bold magenta]",
                 title="[green]Manager Status[/green]",
             )
-            self.save_results(tracker)
+            self.save_results(tracker, plan)
 
         # Raise for now so I can see the issue.
         except Exception as e:
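For orientation, a hypothetical shape of the saved results-<timestamp>.json, assuming `tracker` is a list of per-step records and `plan.plan` is the manager's step plan (both structures are illustrative, not confirmed by the commit):

```python
# Hypothetical contents of results-<timestamp>.json (illustrative fields).
result = {
    "steps": [
        {"agent": "kubernetes-job", "attempts": 2, "times": {"run": 41.3}},
    ],
    "manager": ["build", "deploy"],
}
```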

fractale/agent/optimize/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+from .agent import OptimizationAgent
