perf: optimize LLM orchestration with one-turn resolution and parallel prefetching

Raezil · Raezil · commit 400063543dff · 2026-02-27T14:00:13.000+01:00
diff --git a/agent.go b/agent.go
@@ -1150,6 +1150,23 @@ func (a *Agent) Generate(ctx context.Context, sessionID, userInput string) (any,
 		return "", errors.New("user input is empty")
 	}
 
+	// -------------------------------------------------------------
+	// PREFETCH: Start context retrieval and tool discovery in parallel
+	// -------------------------------------------------------------
+	var (
+		prefetchWG sync.WaitGroup
+		records    []memory.MemoryRecord
+	)
+
+	prefetchWG.Add(1)
+	go func() {
+		defer prefetchWG.Done()
+		records, _ = a.retrieveContext(ctx, sessionID, userInput, a.contextLimit)
+	}()
+
+	// Call ToolSpecs for its caching effect only (result discarded); discovery is internally cached and thread-safe.
+	_ = a.ToolSpecs()
+
 	// ---------------------------------------------
 	// 0. DIRECT TOOL INVOCATION (bypass everything)
 	// ---------------------------------------------
@@ -1202,7 +1219,8 @@ func (a *Agent) Generate(ctx context.Context, sessionID, userInput string) (any,
 	// ---------------------------------------------
 	// 4. TOOL ORCHESTRATOR (normal UTCP tools)
 	// ---------------------------------------------
-	if handled, output, err := a.toolOrchestrator(ctx, sessionID, userInput); handled {
+	prefetchWG.Wait() // Ensure memory is ready for orchestrator
+	if handled, output, err := a.toolOrchestrator(ctx, sessionID, userInput, records); handled {
 		if err != nil {
 			return "", err
 		}
@@ -1225,14 +1243,24 @@ func (a *Agent) Generate(ctx context.Context, sessionID, userInput string) (any,
 	// ---------------------------------------------
 	// 6. LLM COMPLETION
 	// ---------------------------------------------
-	prompt, err := a.buildPrompt(ctx, sessionID, userInput)
-	if err != nil {
-		return "", err
-	}
+	// Build LLM prompt without tools/subagents:
+	var sb strings.Builder
+	sb.Grow(4096)
+
+	sb.WriteString(a.systemPrompt)
+	sb.WriteString("\n\nConversation memory (TOON):\n")
+	sb.WriteString(a.renderMemory(records))
+
+	sb.WriteString("\n\nUser: ")
+	sb.WriteString(sanitizeInput(userInput))
+	sb.WriteString("\n\n")
+
+	prompt := sb.String()
 
 	files, _ := a.RetrieveAttachmentFiles(ctx, sessionID, a.contextLimit)
 
 	var completion any
+	var err error
 	if len(files) > 0 {
 		completion, err = a.model.GenerateWithFiles(ctx, prompt, files)
 	} else {
@@ -1429,6 +1457,7 @@ type ToolChoice struct {
 	ToolName  string         `json:"tool_name"`
 	Arguments map[string]any `json:"arguments"`
 	Reason    string         `json:"reason"`
+	Answer    string         `json:"answer"` // Added for one-turn resolution
 }
 
 // toolOrchestrator asks the model whether a UTCP tool is needed and reports (handled, output, err).
@@ -1437,6 +1466,7 @@ func (a *Agent) toolOrchestrator(
 	ctx context.Context,
 	sessionID string,
 	userInput string,
+	records []memory.MemoryRecord,
 ) (bool, string, error) {
 
 	// FAST PATH: Skip LLM call for obvious non-tool queries
@@ -1476,31 +1506,40 @@ func (a *Agent) toolOrchestrator(
 		})
 	}
 
-	// Build tool selection prompt
+	// Build tool selection prompt with memory context
 	toolDesc := a.cachedToolPrompt(toolList)
+	memoryDesc := a.renderMemory(records)
 
 	choicePrompt := fmt.Sprintf(`
-You are a UTCP tool selection engine.
+You are a UTCP tool selection and planning engine.
 
-A user asked:
+USER REQUEST:
 %q
 
-You have access to these UTCP tools:
+CONVERSATION MEMORY:
 %s
 
-Think step-by-step whether ANY tool should be used.
+AVAILABLE UTCP TOOLS:
+%s
+
+OBJECTIVE:
+Analyze if the user's request requires calling a tool or if it can be answered directly using conversational memory.
 
-Return ONLY a JSON object EXACTLY like this:
+RULES:
+1. If a tool is needed, set "use_tool": true and provide "tool_name" and "arguments".
+2. If NO tool is needed, set "use_tool": false and provide the final answer in "answer".
+3. Use only the exact tool names provided.
 
+Return ONLY a JSON object:
 {
   "use_tool": true|false,
   "tool_name": "name or empty",
   "arguments": { },
-  "stream": true|false
+  "answer": "Complete final answer if no tool is used",
+  "reason": "Short explanation"
 }
 
-Return ONLY JSON. No explanations.
-`, userInput, toolDesc)
+Return ONLY JSON.`, userInput, memoryDesc, toolDesc)
 
 	// Query LLM
 	raw, err := a.model.Generate(ctx, choicePrompt)
@@ -1522,6 +1561,10 @@ Return ONLY JSON. No explanations.
 	}
 
 	if !tc.UseTool {
+		if tc.Answer != "" {
+			a.storeMemory(sessionID, "assistant", tc.Answer, nil)
+			return true, tc.Answer, nil
+		}
 		return false, "", nil
 	}
 	if strings.TrimSpace(tc.ToolName) == "" {
@@ -1689,16 +1732,25 @@ func extractJSON(response string) string {
 // This AVOIDS expensive LLM calls for obvious non-tool queries.
 // EXTREMELY CONSERVATIVE: only filters very short inputs, greetings, and pure informational questions.
 func (a *Agent) likelyNeedsToolCall(lowerInput string) bool {
-	// ONLY filter out EXPLICIT pure informational questions
-	// Examples: "what is X?", "explain Y", "why does Z"
+	// 0. Very short inputs (under 2 bytes) and greetings never need a tool call
+	if len(lowerInput) < 2 {
+		return false
+	}
+	greetings := []string{"hello", "hi", "hey", "good morning", "good afternoon", "thanks", "thank you"}
+	for _, g := range greetings {
+		if lowerInput == g || strings.HasPrefix(lowerInput, g+" ") || strings.HasPrefix(lowerInput, g+",") {
+			return false
+		}
+	}
 
-	// Check for pure question patterns WITHOUT any action words
+	// 1. Check for pure informational question patterns WITHOUT any action words
 	pureQuestionStarters := []string{
 		"what is ", "what are ", "what does ", "what's ",
 		"why is ", "why are ", "why does ", "why do ",
 		"who is ", "who are ", "who was ",
 		"when is ", "when was ", "when did ",
 		"where is ", "where are ", "where was ",
+		"how is ", "how are ", "how does ",
 		"explain ", "describe ", "define ",
 		"tell me about ", "tell me what ",
 	}
@@ -1711,7 +1763,9 @@ func (a *Agent) likelyNeedsToolCall(lowerInput string) bool {
 				strings.Contains(lowerInput, " get") ||
 				strings.Contains(lowerInput, " list") ||
 				strings.Contains(lowerInput, " show") ||
-				strings.Contains(lowerInput, " files")
+				strings.Contains(lowerInput, " files") ||
+				strings.Contains(lowerInput, " run") ||
+				strings.Contains(lowerInput, " exec")
 
 			if !hasActionWord {
 				// Pure informational question - skip tool orchestration
diff --git a/agent_stream.go b/agent_stream.go
@@ -5,7 +5,9 @@ import (
 	"errors"
 	"fmt"
 	"strings"
+	"sync"
 
+	"github.com/Protocol-Lattice/go-agent/src/memory"
 	"github.com/Protocol-Lattice/go-agent/src/models"
 )
 
@@ -17,6 +19,23 @@ func (a *Agent) GenerateStream(ctx context.Context, sessionID, userInput string)
 		return nil, errors.New("user input is empty")
 	}
 
+	// -------------------------------------------------------------
+	// PREFETCH: Start context retrieval and tool discovery in parallel
+	// -------------------------------------------------------------
+	var (
+		prefetchWG sync.WaitGroup
+		records    []memory.MemoryRecord
+	)
+
+	prefetchWG.Add(1)
+	go func() {
+		defer prefetchWG.Done()
+		records, _ = a.retrieveContext(ctx, sessionID, userInput, a.contextLimit)
+	}()
+
+	// Call ToolSpecs for its caching effect only (result discarded); discovery is internally cached and thread-safe.
+	_ = a.ToolSpecs()
+
 	// Helper to wrap immediate result in a stream
 	immediateStream := func(val any, err error) (<-chan models.StreamChunk, error) {
 		ch := make(chan models.StreamChunk, 1)
@@ -58,7 +77,8 @@ func (a *Agent) GenerateStream(ctx context.Context, sessionID, userInput string)
 	}
 
 	// 4. TOOL ORCHESTRATOR
-	if handled, output, err := a.toolOrchestrator(ctx, sessionID, userInput); handled {
+	prefetchWG.Wait()
+	if handled, output, err := a.toolOrchestrator(ctx, sessionID, userInput, records); handled {
 		return immediateStream(output, err)
 	}
 
@@ -71,13 +91,17 @@ func (a *Agent) GenerateStream(ctx context.Context, sessionID, userInput string)
 	}
 
 	// 6. LLM COMPLETION (Streaming)
-	prompt, err := a.buildPrompt(ctx, sessionID, userInput)
-	if err != nil {
-		return nil, err
-	}
-
-	// Note: Currently GenerateStream does not support file attachments for streaming.
-	// We proceed with text-only streaming.
+	// Build the prompt manually from the pre-fetched records; streaming stays text-only (no file attachments).
+	var sb strings.Builder
+	sb.Grow(4096)
+	sb.WriteString(a.systemPrompt)
+	sb.WriteString("\n\nConversation memory (TOON):\n")
+	sb.WriteString(a.renderMemory(records))
+	sb.WriteString("\n\nUser: ")
+	sb.WriteString(sanitizeInput(userInput))
+	sb.WriteString("\n\n")
+
+	prompt := sb.String()
 
 	stream, err := a.model.GenerateStream(ctx, prompt)
 	if err != nil {
diff --git a/agent_test.go b/agent_test.go
@@ -799,7 +799,7 @@ func TestGenerate_ExecutesUTCPCalledTool(t *testing.T) {
 		t.Fatalf("New returned error: %v", err)
 	}
 
-	out, err := agent.Generate(ctx, "s1", "hello")
+	out, err := agent.Generate(ctx, "s1", "echo something")
 	if err != nil {
 		t.Fatalf("Generate returned error: %v", err)
 	}

Original file line number	Diff line number	Diff line change
`@@ -799,7 +799,7 @@ func TestGenerate_ExecutesUTCPCalledTool(t *testing.T) {`
`799`	`799`	`t.Fatalf("New returned error: %v", err)`
`800`	`800`	`}`
`801`	`801`
`802`		`- out, err := agent.Generate(ctx, "s1", "hello")`
	`802`	`+ out, err := agent.Generate(ctx, "s1", "echo something")`
`803`	`803`	`if err != nil {`
`804`	`804`	`t.Fatalf("Generate returned error: %v", err)`
`805`	`805`	`}`