Merge pull request #39 from priyanshujain/feat-spectest

priyanshujain · web-flow · commit a48d4e9e025b · 2026-03-10T16:36:40.000+07:00
Add spectest framework, fix Gemini provider, rewrite system prompts
diff --git a/agent/agent.go b/agent/agent.go
@@ -109,6 +109,7 @@ func (a *Agent) Run(ctx context.Context, input string) (string, error) {
 				Type: provider.ContentToolResult,
 				ToolResult: &provider.ToolResult{
 					ToolUseID: call.ID,
+					Name:      call.Name,
 					Content:   content,
 					IsError:   isError,
 				},
diff --git a/channel/telegram/session.go b/channel/telegram/session.go
@@ -191,11 +191,26 @@ func (sm *SessionManager) newAgent() (*agent.Agent, error) {
 }
 
 func (sm *SessionManager) buildSystemPrompt() string {
-	system := `You are a personal AI assistant powered by OpenBotKit, communicating via Telegram. You help users with email, messaging, notes, and other tasks.
-
-You have core tools available: bash (run commands), file_read, file_write, file_edit, load_skills, search_skills.
-
-To handle domain-specific tasks (email, WhatsApp, notes, etc.), first use search_skills to find relevant skills, then use load_skills to get detailed instructions.
+	system := `You are a personal AI assistant powered by OpenBotKit, communicating via Telegram.
+
+## Tools
+Available: bash, file_read, file_write, file_edit, load_skills, search_skills.
+Tool names are case-sensitive. Call tools exactly as listed.
+
+Rules:
+- ALWAYS use tools to perform actions. Never say you will do something without calling the tool.
+- Never predict or claim results before receiving them. Wait for tool output.
+- Do not narrate routine tool calls — just call the tool. Only explain when the step is non-obvious or the user asked for details.
+- If a tool call fails, analyze the error before retrying with a different approach.
+- Be concise and direct. Skip filler phrases.
+
+## Skills
+Before replying to domain-specific requests (email, WhatsApp, memories, notes, etc.):
+1. Scan the "Available skills" list below for matching skill names
+2. Use load_skills to read the skill's instructions
+3. Use bash to run the commands from those instructions
+4. If the request spans multiple domains, load and use ALL relevant skills
+5. If no skill matches, use search_skills to discover one by keyword
 `
 
 	idx, err := skills.LoadIndex()
diff --git a/internal/cli/chat.go b/internal/cli/chat.go
@@ -149,14 +149,27 @@ func generateSessionID() string {
 }
 
 func buildSystemPrompt() string {
-	system := `You are a personal AI assistant powered by OpenBotKit. You help users with email, messaging, notes, and other tasks.
-
-You have core tools available: bash (run commands), file_read, file_write, file_edit, load_skills, search_skills.
-
-To handle domain-specific tasks (email, WhatsApp, notes, etc.), first use search_skills to find relevant skills, then use load_skills to get detailed instructions. Skills teach you how to use bash and sqlite3 for specific domains.
+	system := `You are a personal AI assistant powered by OpenBotKit.
+
+## Tools
+Available: bash, file_read, file_write, file_edit, load_skills, search_skills.
+Tool names are case-sensitive. Call tools exactly as listed.
+
+Rules:
+- ALWAYS use tools to perform actions. Never say you will do something without calling the tool.
+- Never predict or claim results before receiving them. Wait for tool output.
+- Do not narrate routine tool calls — just call the tool. Only explain when the step is non-obvious or the user asked for details.
+- If a tool call fails, analyze the error before retrying with a different approach.
+
+## Skills
+Before replying to domain-specific requests (email, WhatsApp, memories, notes, etc.):
+1. Scan the "Available skills" list below for matching skill names
+2. Use load_skills to read the skill's instructions
+3. Use bash to run the commands from those instructions
+4. If the request spans multiple domains, load and use ALL relevant skills
+5. If no skill matches, use search_skills to discover one by keyword
 `
 
-	// Append skill index if available.
 	idx, err := skills.LoadIndex()
 	if err == nil && len(idx.Skills) > 0 {
 		system += "\nAvailable skills:\n"
diff --git a/provider/gemini/gemini.go b/provider/gemini/gemini.go
@@ -219,16 +219,21 @@ func convertMessage(m provider.Message) []map[string]any {
 		role = "model"
 	}
 
-	var parts []map[string]any
+	// Check if this message contains tool results — Gemini requires
+	// functionResponse parts in a separate "user" content with all
+	// results grouped together.
+	var funcResponseParts []map[string]any
+	var otherParts []map[string]any
+
 	for _, block := range m.Content {
 		switch block.Type {
 		case provider.ContentText:
-			parts = append(parts, map[string]any{"text": block.Text})
+			otherParts = append(otherParts, map[string]any{"text": block.Text})
 		case provider.ContentToolUse:
 			if block.ToolCall != nil {
 				var args map[string]any
 				_ = json.Unmarshal(block.ToolCall.Input, &args)
-				parts = append(parts, map[string]any{
+				otherParts = append(otherParts, map[string]any{
 					"functionCall": map[string]any{
 						"name": block.ToolCall.Name,
 						"args": args,
@@ -237,28 +242,46 @@ func convertMessage(m provider.Message) []map[string]any {
 			}
 		case provider.ContentToolResult:
 			if block.ToolResult != nil {
-				// Gemini expects functionResponse in a separate "user" content.
 				var response map[string]any
 				if err := json.Unmarshal([]byte(block.ToolResult.Content), &response); err != nil {
 					response = map[string]any{"result": block.ToolResult.Content}
 				}
-				return []map[string]any{{
-					"role": "user",
-					"parts": []map[string]any{{
-						"functionResponse": map[string]any{
-							"name":     block.ToolResult.ToolUseID,
-							"response": response,
-						},
-					}},
-				}}
+				// Gemini matches functionResponse by function name, not by call ID.
+				name := block.ToolResult.Name
+				if name == "" {
+					name = block.ToolResult.ToolUseID
+				}
+				funcResponseParts = append(funcResponseParts, map[string]any{
+					"functionResponse": map[string]any{
+						"name":     name,
+						"response": response,
+					},
+				})
 			}
 		}
 	}
 
-	return []map[string]any{{
-		"role":  role,
-		"parts": parts,
-	}}
+	var result []map[string]any
+	if len(otherParts) > 0 {
+		result = append(result, map[string]any{
+			"role":  role,
+			"parts": otherParts,
+		})
+	}
+	if len(funcResponseParts) > 0 {
+		result = append(result, map[string]any{
+			"role":  "user",
+			"parts": funcResponseParts,
+		})
+	}
+	if len(result) == 0 {
+		result = append(result, map[string]any{
+			"role":  role,
+			"parts": []map[string]any{},
+		})
+	}
+
+	return result
 }
 
 func (g *Gemini) doRequest(ctx context.Context, url string, body map[string]any) (io.ReadCloser, error) {
diff --git a/provider/types.go b/provider/types.go
@@ -62,6 +62,7 @@ type ToolCall struct {
 // ToolResult represents the output of a tool invocation.
 type ToolResult struct {
 	ToolUseID string `json:"tool_use_id"`
+	Name      string `json:"name"`
 	Content   string `json:"content"`
 	IsError   bool   `json:"is_error,omitempty"`
 }
diff --git a/skills/memory-save/SKILL.md b/skills/memory-save/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: memory-save
-description: Save a personal fact about the user to memory when they ask you to remember something
+description: Save and recall personal facts about the user (memories, preferences, relationships)
 allowed-tools: Bash(obk *)
 ---
 
diff --git a/spectest/assert.go b/spectest/assert.go
@@ -0,0 +1,23 @@
+package spectest
+
+import (
+	"strings"
+	"testing"
+)
+
+func AssertNotEmpty(t *testing.T, response string) {
+	t.Helper()
+	if strings.TrimSpace(response) == "" {
+		t.Fatal("expected non-empty response")
+	}
+}
+
+func AssertContains(t *testing.T, response string, substrings ...string) {
+	t.Helper()
+	lower := strings.ToLower(response)
+	for _, s := range substrings {
+		if !strings.Contains(lower, strings.ToLower(s)) {
+			t.Errorf("expected response to contain %q, got:\n%s", s, response)
+		}
+	}
+}
diff --git a/spectest/cross_source_test.go b/spectest/cross_source_test.go
@@ -0,0 +1,78 @@
+package spectest
+
+import (
+	"context"
+	"testing"
+	"time"
+)
+
+// TestSpec_SummarizeCommunicationsAcrossSources seeds emails and WhatsApp
+// messages from the same person, then asks the agent to summarize all
+// communications. The agent must autonomously discover and use both email-read
+// and whatsapp-read skills, query both databases, and synthesize the results
+// in a single turn.
+func TestSpec_SummarizeCommunicationsAcrossSources(t *testing.T) {
+	EachProvider(t, func(t *testing.T, fx *LocalFixture) {
+		fx.GivenEmails(t, []Email{
+			{From: "alice@acme.com", To: "me@example.com", Subject: "Q3 Budget Review", Body: "Hi, please review the Q3 budget spreadsheet I shared. We need to finalize numbers by Friday."},
+			{From: "alice@acme.com", To: "me@example.com", Subject: "Team Offsite in Portland", Body: "I'm thinking we do the offsite in Portland in October. Thoughts?"},
+		})
+
+		fx.GivenWhatsAppMessages(t, []WhatsAppMessage{
+			{SenderJID: "alice@s.whatsapp.net", SenderName: "Alice", ChatJID: "alice@s.whatsapp.net", ChatName: "Alice", Text: "Booked Trattoria Vecchia for Friday dinner, confirmation code TRV-8842."},
+			{SenderJID: "alice@s.whatsapp.net", SenderName: "Alice", ChatJID: "alice@s.whatsapp.net", ChatName: "Alice", Text: "Can you bring the Nakamura prototype to the offsite? Serial number NK-2047."},
+		})
+
+		a := fx.Agent(t)
+		ctx, cancel := context.WithTimeout(context.Background(), 180*time.Second)
+		defer cancel()
+
+		prompt := "Summarize all communications from Alice across both email and WhatsApp."
+		result, err := a.Run(ctx, prompt)
+		if err != nil {
+			t.Fatalf("agent.Run: %v", err)
+		}
+
+		AssertNotEmpty(t, result)
+		AssertJudge(t, fx.Provider, fx.Model, prompt, result,
+			"The response must cover topics from BOTH email and WhatsApp. "+
+				"It should mention the Q3 budget review or Portland offsite from email, AND reference "+
+				"Trattoria Vecchia, TRV-8842, Nakamura prototype, or NK-2047 from WhatsApp. "+
+				"It should not claim that data from one source is missing if it was provided.")
+	})
+}
+
+// TestSpec_RecallMemoryAndCorrelateEmails seeds personal memories about a
+// relationship and emails with project details. The agent must autonomously
+// check memories for context about the person, search emails for specifics,
+// and combine both into a coherent answer in a single turn.
+func TestSpec_RecallMemoryAndCorrelateEmails(t *testing.T) {
+	EachProvider(t, func(t *testing.T, fx *LocalFixture) {
+		fx.GivenMemories(t, []UserMemory{
+			{Content: "Raj Patel is my tech lead at Zephyr Industries", Category: "relationship"},
+			{Content: "Project Firebird has a hard deadline of June 15, 2025", Category: "project"},
+		})
+
+		fx.GivenEmails(t, []Email{
+			{From: "raj.patel@zephyr.io", To: "me@example.com", Subject: "Project Firebird Sprint 7 Retro", Body: "Sprint 7 retro is scheduled for May 22. Please prepare your notes on the auth module refactor."},
+			{From: "raj.patel@zephyr.io", To: "me@example.com", Subject: "Project Firebird Launch Prep", Body: "Client confirmed the staging demo for June 10. We need all QA passed by June 8."},
+		})
+
+		a := fx.Agent(t)
+		ctx, cancel := context.WithTimeout(context.Background(), 180*time.Second)
+		defer cancel()
+
+		prompt := "Tell me everything about Raj and Project Firebird. Check both my memories and emails."
+		result, err := a.Run(ctx, prompt)
+		if err != nil {
+			t.Fatalf("agent.Run: %v", err)
+		}
+
+		AssertNotEmpty(t, result)
+		AssertJudge(t, fx.Provider, fx.Model, prompt, result,
+			"The response must include information from BOTH memories and emails. "+
+				"It should mention Raj Patel is the tech lead at Zephyr Industries (from memory) AND mention "+
+				"email subjects or content about Project Firebird Sprint 7 and Launch Prep (from emails). "+
+				"It should not only use one source.")
+	})
+}
diff --git a/spectest/email_query_test.go b/spectest/email_query_test.go
@@ -0,0 +1,32 @@
+package spectest
+
+import (
+	"context"
+	"testing"
+	"time"
+)
+
+func TestSpec_FindEmailsBySender(t *testing.T) {
+	EachProvider(t, func(t *testing.T, fx *LocalFixture) {
+		fx.GivenEmails(t, []Email{
+			{From: "alice@example.com", Subject: "Meeting Tomorrow", Body: "Let's meet at 2pm"},
+			{From: "bob@example.com", Subject: "Project Update", Body: "Here is the latest"},
+			{From: "alice@example.com", Subject: "Lunch Plans", Body: "Friday lunch?"},
+		})
+
+		a := fx.Agent(t)
+		ctx, cancel := context.WithTimeout(context.Background(), 120*time.Second)
+		defer cancel()
+
+		prompt := "Find emails from Alice"
+		result, err := a.Run(ctx, prompt)
+		if err != nil {
+			t.Fatalf("agent.Run: %v", err)
+		}
+
+		AssertNotEmpty(t, result)
+		AssertJudge(t, fx.Provider, fx.Model, prompt, result,
+			"The response must list Alice's emails. It should mention both 'Meeting Tomorrow' and 'Lunch Plans' subjects. "+
+				"It should NOT include Bob's 'Project Update' email.")
+	})
+}
diff --git a/spectest/fixture.go b/spectest/fixture.go
@@ -0,0 +1,38 @@
+package spectest
+
+import (
+	"testing"
+
+	"github.com/priyanshujain/openbotkit/agent"
+)
+
+type Email struct {
+	MessageID string
+	Account   string
+	From      string
+	To        string
+	Subject   string
+	Body      string
+}
+
+type WhatsAppMessage struct {
+	MessageID  string
+	ChatJID    string
+	ChatName   string
+	SenderJID  string
+	SenderName string
+	Text       string
+	IsFromMe   bool
+}
+
+type UserMemory struct {
+	Content  string
+	Category string // identity, preference, relationship, project
+}
+
+type Fixture interface {
+	Agent(t *testing.T) *agent.Agent
+	GivenEmails(t *testing.T, emails []Email)
+	GivenWhatsAppMessages(t *testing.T, messages []WhatsAppMessage)
+	GivenMemories(t *testing.T, memories []UserMemory)
+}
diff --git a/spectest/judge.go b/spectest/judge.go
diff --git a/spectest/local_fixture.go b/spectest/local_fixture.go

Original file line number	Diff line number	Diff line change
		@@ -62,6 +62,7 @@ type ToolCall struct {
62	62	// ToolResult represents the output of a tool invocation.
63	63	type ToolResult struct {
64	64		ToolUseID string `json:"tool_use_id"`
	65	+	Name      string `json:"name"`
65	66		Content   string `json:"content"`
66	67		IsError   bool   `json:"is_error,omitempty"`
67	68	}