Add a preview stage to pipelines to speed things up and reduce token usage

kantord · kantord · commit 47f7ee6bf5aa · 2025-12-03T14:15:41.000+01:00
diff --git a/main.py b/main.py
@@ -58,10 +58,11 @@ async def execute_pipeline(pipeline: list[dict]) -> str:
     A pipeline chains multiple stages where data flows from one to the next:
     - Tool stages: Call external tools (from list_all_tools)
     - Command stages: Transform data with jq, grep, sed, awk, etc.
+    - Preview stages: Inspect data structure before processing (recommended!)
 
     Pipeline Structure:
     Each stage is a dict with:
-    - type: "tool" | "command"
+    - type: "tool" | "command" | "preview"
     - for_each (optional): Process items one-by-one instead of all at once
 
     Tool Stage:
@@ -75,6 +76,27 @@ async def execute_pipeline(pipeline: list[dict]) -> str:
     - Runs whitelisted shell commands (see list_available_shell_commands)
     - Command and args MUST be separate (security requirement)
 
+    Preview Stage:
+    {"type": "preview", "chars": 3000}
+    - Shows a SUMMARIZED view of the data (default: 3000 chars)
+    - ⚠️ OUTPUT IS NOT VALID JSON - uses pseudo-format with /* N more */ markers
+    - Use this to understand data structure BEFORE writing jq filters
+    - Example output:
+      === PREVIEW (not valid JSON, showing structure only) ===
+      {
+        items: [
+          { id: 1, name: "First", data: { /* 3 more */ } },
+          /* 47 more */
+        ]
+      }
+      === END PREVIEW ===
+
+    Example - Preview data before processing (RECOMMENDED first step):
+    [
+        {"type": "tool", "name": "fetch", "server": "api", "args": {"url": "..."}},
+        {"type": "preview", "chars": 2000}
+    ]
+
     Example - Chain tools with data transformation:
     [
         {"type": "tool", "name": "get_data", "server": "database", "args": {"table": "users"}},
@@ -107,6 +129,7 @@ async def execute_pipeline(pipeline: list[dict]) -> str:
       This avoids "unexpected additional properties" errors from automatic merging
 
     Best Practices:
+    - Use preview stages to inspect data BEFORE writing jq filters
     - Build complete workflows as single pipelines (don't split unnecessarily)
     - Check list_all_tools first to see what's available
     - Use get_tool_details(server, tool_name) to see exact tool parameters/schema
diff --git a/pyproject.toml b/pyproject.toml
@@ -6,6 +6,7 @@ readme = "README.md"
 requires-python = ">=3.13"
 dependencies = [
     "fastmcp>=2.12.4",
+    "headson>=0.10.0",
     "httpx>=0.27.0",
 ]
 
diff --git a/shell_engine.py b/shell_engine.py
@@ -13,6 +13,8 @@
 from pathlib import Path
 from typing import Any
 
+import headson
+
 
 def _running_in_container() -> bool:
     """Detect if we're running inside a container (Docker, Podman, etc.).
@@ -554,6 +556,44 @@ async def execute_pipeline(self, pipeline: list[dict]) -> str:
                             f"Stage {idx + 1} (tool {server_name}/{tool_name}) failed: {str(e)}"
                         )
 
+                elif item_type == "preview":
+                    # Preview stage: summarize upstream data for the agent to inspect
+                    # Uses headson to create a structure-aware preview within a char budget
+                    # Output is NOT valid JSON - it uses pseudo-format with /* N more */ markers
+                    chars = item.get("chars", 3000)
+
+                    if not isinstance(chars, int) or chars <= 0:
+                        raise ValueError(
+                            f"Preview 'chars' must be a positive integer, got {chars}"
+                        )
+
+                    try:
+                        # Collect upstream data
+                        input_data = "".join(upstream)
+
+                        # Generate preview using headson with detailed style
+                        # detailed style shows /* N more */ markers so agent knows data was truncated
+                        preview = headson.summarize(
+                            input_data,
+                            format="json",
+                            style="detailed",
+                            input_format="json",
+                            byte_budget=chars,  # headson uses byte_budget param
+                        )
+
+                        # Add clear marker that this is a preview, not real data
+                        preview_output = (
+                            "=== PREVIEW (not valid JSON, showing structure only) ===\n"
+                            f"{preview}\n"
+                            "=== END PREVIEW ===\n"
+                        )
+
+                        upstream = iter([preview_output])
+                    except Exception as e:
+                        raise RuntimeError(
+                            f"Stage {idx + 1} (preview) failed: {str(e)}"
+                        )
+
                 else:
                     raise ValueError(f"Unknown pipeline item type: {item_type}")
 
diff --git a/tests/test_shell_engine.py b/tests/test_shell_engine.py
@@ -534,6 +534,92 @@ async def mock_caller(server, tool, args):
         assert "result 3" in result
 
 
+@pytest.mark.asyncio
+class TestPreviewStage:
+    """Test preview stage functionality."""
+
+    async def test_preview_stage_basic(self):
+        """Test that preview stage summarizes JSON data."""
+        large_data = json.dumps({"items": [{"id": i, "name": f"Item {i}"} for i in range(100)]})
+        mock_caller = AsyncMock(return_value=MockToolResult(large_data))
+        engine = ShellEngine(tool_caller=mock_caller)
+
+        pipeline = [
+            {"type": "tool", "name": "get_data", "server": "test", "args": {}},
+            {"type": "preview", "chars": 500},
+        ]
+
+        result = await engine.execute_pipeline(pipeline)
+
+        # Should contain preview markers
+        assert "=== PREVIEW" in result
+        assert "not valid JSON" in result
+        assert "=== END PREVIEW ===" in result
+        # Should show structure but be truncated
+        assert "items" in result
+        # The output should be smaller than the input
+        assert len(result) < len(large_data)
+
+    async def test_preview_stage_shows_omission_markers(self):
+        """Test that preview shows /* N more */ markers for truncated data."""
+        large_array = json.dumps(list(range(1000)))
+        mock_caller = AsyncMock(return_value=MockToolResult(large_array))
+        engine = ShellEngine(tool_caller=mock_caller)
+
+        pipeline = [
+            {"type": "tool", "name": "get_data", "server": "test", "args": {}},
+            {"type": "preview", "chars": 200},
+        ]
+
+        result = await engine.execute_pipeline(pipeline)
+
+        # detailed style should show omission counts
+        assert "more" in result.lower()
+
+    async def test_preview_stage_default_chars(self):
+        """Test that preview stage uses default 3000 chars when not specified."""
+        mock_caller = AsyncMock(return_value=MockToolResult('{"test": "data"}'))
+        engine = ShellEngine(tool_caller=mock_caller)
+
+        pipeline = [
+            {"type": "tool", "name": "get_data", "server": "test", "args": {}},
+            {"type": "preview"},  # No chars specified
+        ]
+
+        # Should not raise, uses default
+        result = await engine.execute_pipeline(pipeline)
+        assert "=== PREVIEW" in result
+
+    async def test_preview_stage_invalid_chars(self):
+        """Test that preview stage rejects invalid chars parameter."""
+        mock_caller = AsyncMock(return_value=MockToolResult('{"test": "data"}'))
+        engine = ShellEngine(tool_caller=mock_caller)
+
+        pipeline = [
+            {"type": "tool", "name": "get_data", "server": "test", "args": {}},
+            {"type": "preview", "chars": -100},
+        ]
+
+        with pytest.raises(RuntimeError, match="chars.*must be a positive integer"):
+            await engine.execute_pipeline(pipeline)
+
+    async def test_preview_stage_in_middle_of_pipeline(self):
+        """Test that preview can be used mid-pipeline (though output won't be valid JSON)."""
+        mock_caller = AsyncMock(return_value=MockToolResult('{"value": 42}'))
+        engine = ShellEngine(tool_caller=mock_caller)
+
+        # Preview in the middle - subsequent stages see preview output, not original data
+        pipeline = [
+            {"type": "tool", "name": "get_data", "server": "test", "args": {}},
+            {"type": "preview", "chars": 500},
+            {"type": "command", "command": "wc", "args": ["-l"]},  # Count lines
+        ]
+
+        result = await engine.execute_pipeline(pipeline)
+        # wc -l should return a number (the line count of the preview)
+        assert result.strip().isdigit() or result.strip().split()[0].isdigit()
+
+
 @pytest.mark.asyncio
 class TestErrorHandling:
     """Test error handling and edge cases."""
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -6,6 +6,7 @@ readme = "README.md"`
`6`	`6`	`requires-python = ">=3.13"`
`7`	`7`	`dependencies = [`
`8`	`8`	`"fastmcp>=2.12.4",`
	`9`	`+ "headson>=0.10.0",`
`9`	`10`	`"httpx>=0.27.0",`
`10`	`11`	`]`
`11`	`12`