Merge branch 'main' into ads/cap-757-enhancement-enable-fallback-judge-generator

GangGreenTemperTatum · GangGreenTemperTatum · commit dd87d0a7ff04 · 2026-01-14T13:41:20.000-05:00
diff --git a/.github/workflows/pre-commit.yaml b/.github/workflows/pre-commit.yaml
@@ -33,7 +33,7 @@ jobs:
         uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
 
       - name: Install uv
-        uses: astral-sh/setup-uv@ed21f2f24f8dd64503750218de024bcf64c7250a # v7.1.5
+        uses: astral-sh/setup-uv@61cb8a9741eeb8a550a1b8544337180c0fc8476b # v7.2.0
         with:
           version: "latest"
           python-version: ${{ env.PYTHON_VERSION }}
diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml
@@ -18,7 +18,7 @@ jobs:
         uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
 
       - name: Install uv
-        uses: astral-sh/setup-uv@ed21f2f24f8dd64503750218de024bcf64c7250a # v7.1.5
+        uses: astral-sh/setup-uv@61cb8a9741eeb8a550a1b8544337180c0fc8476b # v7.2.0
         with:
           version: "latest"
           python-version: 3.13
diff --git a/.github/workflows/renovate.yaml b/.github/workflows/renovate.yaml
@@ -80,7 +80,7 @@ jobs:
           GITHUB_TOKEN: "${{ steps.app-token.outputs.token }}"
 
       - name: Renovate
-        uses: renovatebot/github-action@5712c6a41dea6cdf32c72d92a763bd417e6606aa # v44.0.5
+        uses: renovatebot/github-action@66387ab8c2464d575b933fa44e9e5a86b2822809 # v44.2.4
         with:
           configurationFile: "${{ env.RENOVATE_ONBOARDING_CONFIG_FILE_NAME }}"
           token: "${{ steps.app-token.outputs.token }}"
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
@@ -29,7 +29,7 @@ jobs:
       uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
 
     - name: Install uv
-      uses: astral-sh/setup-uv@ed21f2f24f8dd64503750218de024bcf64c7250a # v7.1.5
+      uses: astral-sh/setup-uv@61cb8a9741eeb8a550a1b8544337180c0fc8476b # v7.2.0
       with:
         version: "latest"
         python-version: ${{ matrix.python-version }}
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -18,12 +18,12 @@ repos:
       - id: trailing-whitespace
 
   - repo: https://github.com/rhysd/actionlint
-    rev: v1.7.9
+    rev: v1.7.10
     hooks:
       - id: actionlint
 
   - repo: https://github.com/adrienverge/yamllint.git
-    rev: v1.37.1
+    rev: v1.38.0
     hooks:
       - id: yamllint
         entry: yamllint --strict -c .hooks/linters/yamllint.yaml
diff --git a/dreadnode/__init__.py b/dreadnode/__init__.py
@@ -38,7 +38,7 @@
 if t.TYPE_CHECKING:
     from dreadnode import agent, airt, eval, optimization, scorers, transforms  # noqa: A004
     from dreadnode.agent import Agent, tool, tool_method
-    from dreadnode.data_types import Audio, Image, Table, Video
+    from dreadnode.data_types import Audio, Image, Message, Table, Video
 
 logger.disable("dreadnode")
 
@@ -88,6 +88,7 @@
     "EnvVar",
     "Image",
     "Markdown",
+    "Message",
     "Metric",
     "MetricDict",
     "Object",
@@ -151,6 +152,7 @@
     "Image": "dreadnode.data_types",
     "Table": "dreadnode.data_types",
     "Video": "dreadnode.data_types",
+    "Message": "dreadnode.data_types",
     "Agent": "dreadnode.agent",
     "tool": "dreadnode.agent",
     "tool_method": "dreadnode.agent",
diff --git a/dreadnode/agent/agent.py b/dreadnode/agent/agent.py
@@ -14,7 +14,7 @@
 from rigging.message import inject_system_content
 from ulid import ULID  # can't access via rg
 
-from dreadnode.agent.error import MaxStepsError
+from dreadnode.agent.error import MaxStepsError, MaxToolCallsError
 from dreadnode.agent.events import (
     AgentEnd,
     AgentError,
@@ -89,7 +89,9 @@ class Agent(Model):
     )
     """The agent's core instructions."""
     max_steps: int = Config(default=10)
-    """The maximum number of steps (generation + tool calls)."""
+    """The maximum number of steps (generations)."""
+    max_tool_calls: int = Config(default=-1)
+    """The maximum number of tool calls. The default of -1 means no limit."""
     caching: rg.caching.CacheMode | None = Config(default=None, repr=False)
     """How to handle cache_control entries on inference messages."""
 
@@ -488,10 +490,16 @@ async def _dispatch(event: AgentEvent) -> t.AsyncIterator[AgentEvent]:  # noqa:
             raise winning_reaction
 
         # Tool calling
+        tool_calls = 0
 
         async def _process_tool_call(
             tool_call: "rg.tools.ToolCall",
         ) -> t.AsyncGenerator[AgentEvent, None]:
+            nonlocal tool_calls
+
+            if self.max_tool_calls != -1 and tool_calls >= self.max_tool_calls:
+                raise Finish("Reached maximum allowed tool calls.")
+
             async for event in _dispatch(
                 ToolStart(
                     session_id=session_id,
@@ -513,6 +521,7 @@ async def _process_tool_call(
             tool = next((t for t in self.all_tools if t.name == tool_call.name), None)
 
             if tool is not None:
+                tool_calls += 1
                 try:
                     message, stop = await tool.handle_tool_call(tool_call)
                 except Reaction:
@@ -690,6 +699,9 @@ async def _process_tool_call(
         if step >= self.max_steps:
             error = MaxStepsError(max_steps=self.max_steps)
             stop_reason = "max_steps_reached"
+        elif self.max_tool_calls != -1 and tool_calls >= self.max_tool_calls:
+            error = MaxToolCallsError(max_tool_calls=self.max_tool_calls)
+            stop_reason = "max_tool_calls_reached"
         elif error is not None:
             stop_reason = "error"
         elif events and isinstance(events[-1], AgentStalled):
diff --git a/dreadnode/agent/error.py b/dreadnode/agent/error.py
@@ -4,3 +4,11 @@ class MaxStepsError(Exception):
     def __init__(self, max_steps: int):
         super().__init__(f"Maximum steps reached ({max_steps}).")
         self.max_steps = max_steps
+
+
+class MaxToolCallsError(Exception):
+    """Raise from a hook to stop the agent's run due to reaching the maximum number of tool calls."""
+
+    def __init__(self, max_tool_calls: int):
+        super().__init__(f"Maximum tool calls reached ({max_tool_calls}).")
+        self.max_tool_calls = max_tool_calls
diff --git a/dreadnode/agent/result.py b/dreadnode/agent/result.py
@@ -8,7 +8,9 @@
 if t.TYPE_CHECKING:
     from dreadnode.agent.agent import Agent
 
-AgentStopReason = t.Literal["finished", "max_steps_reached", "error", "stalled"]
+AgentStopReason = t.Literal[
+    "finished", "max_steps_reached", "max_tool_calls_reached", "error", "stalled"
+]
 
 
 @dataclass(config=ConfigDict(arbitrary_types_allowed=True))
diff --git a/dreadnode/agent/thread.py b/dreadnode/agent/thread.py
@@ -1,5 +1,3 @@
-from copy import deepcopy
-
 from pydantic import BaseModel, Field
 from rigging.generator import Usage
 from rigging.message import Message
@@ -44,4 +42,12 @@ def last_usage(self) -> Usage | None:
         return None
 
     def fork(self) -> "Thread":
-        return Thread(messages=deepcopy(self.messages), events=deepcopy(self.events))
+        # Create a new thread with copies of the messages and an empty event list.
+        # Events are a historical record of this thread, so they are not carried over.
+        # Messages are rebuilt as new Message objects to avoid shared references.
+        forked_messages = []
+        for msg in self.messages:
+            # Reconstruct message from its dict representation
+            msg_dict = msg.model_dump()
+            forked_messages.append(Message.model_validate(msg_dict))
+        return Thread(messages=forked_messages, events=[])
diff --git a/dreadnode/agent/tools/tasking.py b/dreadnode/agent/tools/tasking.py
@@ -1,7 +1,10 @@
+import typing as t
+
 from loguru import logger
+from pydantic import PrivateAttr
 
 from dreadnode.agent.reactions import Fail, Finish
-from dreadnode.agent.tools.base import tool
+from dreadnode.agent.tools.base import Toolset, tool, tool_method
 
 
 @tool
@@ -42,9 +45,9 @@ async def give_up_on_task(reason: str) -> None:
     to complete your assigned process.
 
     ## Best Practices
-    - Do Not Use for a Failed Outcome**: If the `finish_task` tool is available, use it to report failures. \
+    - **Do Not Use for a Failed Outcome**: If the `finish_task` tool is available, use it to report failures. \
     This tool is strictly for when you cannot *finish* your work.
-    - Provide a Clear Justification**: The `reason` must clearly explain why you are stuck. \
+    - **Provide a Clear Justification**: The `reason` must clearly explain why you are stuck. \
     Detail the final obstacle you could not overcome and the approaches you already tried.
 
     Args:
@@ -56,3 +59,46 @@ async def give_up_on_task(reason: str) -> None:
     log_metric("task_give_up", 1)
 
     raise Fail("Agent gave up on the task.")
+
+
+class TaskOutputBuffer(Toolset):
+    """
+    Provides a stateful output buffer for accumulating task results.
+
+    This toolset allows the agent to incrementally build up output across multiple
+    steps and tool calls, storing strings that can be retrieved later.
+    """
+
+    _outputs: list[str] = PrivateAttr(default_factory=list)
+    """Internal buffer storing accumulated output strings."""
+
+    @tool_method(catch=True, variants=["all"])
+    async def add_output(
+        self,
+        content: t.Annotated[str, "The content to add to the output buffer."],
+    ) -> str:
+        """
+        Appends new content to the task's stateful output buffer.
+
+        Use this method when you want to build up a longer or multi-part result
+        over several steps or tool calls, instead of returning everything at once.
+        Each call adds the provided `content` string to the existing buffer;
+        previously saved outputs are never overwritten or cleared by this method.
+
+        The return value is a short confirmation message that includes the current
+        number of saved output entries, which can help you track how much content
+        has been accumulated so far.
+        """
+        self._outputs.append(content)
+        return f"Output saved (total outputs: {len(self._outputs)})"
+
+    @tool_method(catch=True, variants=["all"])
+    async def get_output(self) -> list[str]:
+        """Lists all previously saved outputs in order."""
+        return self._outputs
+
+    @tool_method(catch=True, variants=["all"])
+    async def clear_output(self) -> str:
+        """Clears (deletes) all previously stored output. Warning: cleared output cannot be recovered."""
+        self._outputs = []
+        return "Output buffer cleared."
diff --git a/dreadnode/cli/main.py b/dreadnode/cli/main.py
@@ -35,6 +35,7 @@
 from dreadnode.user_config import ServerConfig, UserConfig
 
 cli = cyclopts.App(
+    name="dreadnode",
     help="Interact with Dreadnode platforms",
     version_flags=[],
     help_on_error=True,
diff --git a/dreadnode/data_types/__init__.py b/dreadnode/data_types/__init__.py
@@ -8,17 +8,30 @@
 if t.TYPE_CHECKING:
     from dreadnode.data_types.audio import Audio
     from dreadnode.data_types.image import Image
+    from dreadnode.data_types.message import Message
     from dreadnode.data_types.table import Table
     from dreadnode.data_types.video import Video
 
-__all__ = ["Audio", "Code", "Image", "Markdown", "Object3D", "Table", "Text", "Video", "WithMeta"]
+__all__ = [
+    "Audio",
+    "Code",
+    "Image",
+    "Markdown",
+    "Message",
+    "Object3D",
+    "Table",
+    "Text",
+    "Video",
+    "WithMeta",
+]
 
 __lazy_submodules__: list[str] = []
 __lazy_components__: dict[str, str] = {
     "Audio": "dreadnode.data_types.audio",
     "Image": "dreadnode.data_types.image",
     "Table": "dreadnode.data_types.table",
     "Video": "dreadnode.data_types.video",
+    "Message": "dreadnode.data_types.message",
 }
 
 
diff --git a/dreadnode/logging_.py b/dreadnode/logging_.py
@@ -4,7 +4,6 @@
 To just enable dreadnode logs to flow, call `logger.enable("dreadnode")` after importing the module.
 """
 
-import os
 import pathlib
 import typing as t
 from textwrap import dedent
@@ -27,10 +26,6 @@
     )
 )
 
-# In vscode jupyter, disable rich's jupyter detection to avoid issues with styling
-if "VSCODE_PID" in os.environ:
-    console.is_jupyter = False
-
 
 def configure_logging(
     log_level: LogLevel = "info",
diff --git a/dreadnode/optimization/collectors.py b/dreadnode/optimization/collectors.py
@@ -26,7 +26,7 @@ def get_parent(trial: Trial[CandidateT]) -> Trial[CandidateT] | None:
     trials: list[Trial[CandidateT]] = []
     parent = get_parent(current_trial)
     while parent:
-        trials.insert(0, parent)
+        trials.append(parent)
         parent = get_parent(parent)
 
     return trials[:depth]
diff --git a/dreadnode/optimization/study.py b/dreadnode/optimization/study.py
@@ -379,17 +379,17 @@ async def _run_evaluation(
         )
         logger.trace(f"Candidate: {trial.candidate!r}")
 
-        if dataset == [{}] or (isinstance(dataset, list) and len(dataset) == 1 and not dataset[0]):
-            # Dataset is empty - this is a Study/Attack where the candidate IS the input
-            dataset = [{"message": trial.candidate}]
-            dataset_input_mapping = ["message"]
-        else:
-            dataset_input_mapping = None
+        # if dataset == [{}] or (isinstance(dataset, list) and len(dataset) == 1 and not dataset[0]):
+        #     # Dataset is empty - this is a Study/Attack where the candidate IS the input
+        #     dataset = [{"message": trial.candidate}]
+        #     dataset_input_mapping = ["message"]
+        # else:
+        #     dataset_input_mapping = None
 
         evaluator = Eval(
             task=task,
             dataset=dataset,
-            dataset_input_mapping=dataset_input_mapping,
+            # dataset_input_mapping=dataset_input_mapping,
             scorers=scorers,
             hooks=self.hooks,
             max_consecutive_errors=self.max_consecutive_errors,
diff --git a/dreadnode/scorers/judge.py b/dreadnode/scorers/judge.py
diff --git a/examples/airt/crescendo_crucible_target.ipynb b/examples/airt/crescendo_crucible_target.ipynb
diff --git a/examples/airt/tap_crucible_target.ipynb b/examples/airt/tap_crucible_target.ipynb
diff --git a/pyproject.toml b/pyproject.toml
diff --git a/tests/test_agent.py b/tests/test_agent.py
diff --git a/uv.lock b/uv.lock