Skip to content

Commit a48d4e9

Browse files
Merge pull request #39 from priyanshujain/feat-spectest
Add spectest framework, fix Gemini provider, rewrite system prompts
2 parents ea9cab2 + 338429a commit a48d4e9

File tree

12 files changed

+709
-29
lines changed

12 files changed

+709
-29
lines changed

agent/agent.go

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -109,6 +109,7 @@ func (a *Agent) Run(ctx context.Context, input string) (string, error) {
109109
Type: provider.ContentToolResult,
110110
ToolResult: &provider.ToolResult{
111111
ToolUseID: call.ID,
112+
Name: call.Name,
112113
Content: content,
113114
IsError: isError,
114115
},

channel/telegram/session.go

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -191,11 +191,26 @@ func (sm *SessionManager) newAgent() (*agent.Agent, error) {
191191
}
192192

193193
func (sm *SessionManager) buildSystemPrompt() string {
194-
system := `You are a personal AI assistant powered by OpenBotKit, communicating via Telegram. You help users with email, messaging, notes, and other tasks.
195-
196-
You have core tools available: bash (run commands), file_read, file_write, file_edit, load_skills, search_skills.
197-
198-
To handle domain-specific tasks (email, WhatsApp, notes, etc.), first use search_skills to find relevant skills, then use load_skills to get detailed instructions.
194+
system := `You are a personal AI assistant powered by OpenBotKit, communicating via Telegram.
195+
196+
## Tools
197+
Available: bash, file_read, file_write, file_edit, load_skills, search_skills.
198+
Tool names are case-sensitive. Call tools exactly as listed.
199+
200+
Rules:
201+
- ALWAYS use tools to perform actions. Never say you will do something without calling the tool.
202+
- Never predict or claim results before receiving them. Wait for tool output.
203+
- Do not narrate routine tool calls — just call the tool. Only explain when the step is non-obvious or the user asked for details.
204+
- If a tool call fails, analyze the error before retrying with a different approach.
205+
- Be concise and direct. Skip filler phrases.
206+
207+
## Skills
208+
Before replying to domain-specific requests (email, WhatsApp, memories, notes, etc.):
209+
1. Scan the "Available skills" list below for matching skill names
210+
2. Use load_skills to read the skill's instructions
211+
3. Use bash to run the commands from those instructions
212+
4. If the request spans multiple domains, load and use ALL relevant skills
213+
5. If no skill matches, use search_skills to discover one by keyword
199214
`
200215

201216
idx, err := skills.LoadIndex()

internal/cli/chat.go

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -149,14 +149,27 @@ func generateSessionID() string {
149149
}
150150

151151
func buildSystemPrompt() string {
152-
system := `You are a personal AI assistant powered by OpenBotKit. You help users with email, messaging, notes, and other tasks.
153-
154-
You have core tools available: bash (run commands), file_read, file_write, file_edit, load_skills, search_skills.
155-
156-
To handle domain-specific tasks (email, WhatsApp, notes, etc.), first use search_skills to find relevant skills, then use load_skills to get detailed instructions. Skills teach you how to use bash and sqlite3 for specific domains.
152+
system := `You are a personal AI assistant powered by OpenBotKit.
153+
154+
## Tools
155+
Available: bash, file_read, file_write, file_edit, load_skills, search_skills.
156+
Tool names are case-sensitive. Call tools exactly as listed.
157+
158+
Rules:
159+
- ALWAYS use tools to perform actions. Never say you will do something without calling the tool.
160+
- Never predict or claim results before receiving them. Wait for tool output.
161+
- Do not narrate routine tool calls — just call the tool. Only explain when the step is non-obvious or the user asked for details.
162+
- If a tool call fails, analyze the error before retrying with a different approach.
163+
164+
## Skills
165+
Before replying to domain-specific requests (email, WhatsApp, memories, notes, etc.):
166+
1. Scan the "Available skills" list below for matching skill names
167+
2. Use load_skills to read the skill's instructions
168+
3. Use bash to run the commands from those instructions
169+
4. If the request spans multiple domains, load and use ALL relevant skills
170+
5. If no skill matches, use search_skills to discover one by keyword
157171
`
158172

159-
// Append skill index if available.
160173
idx, err := skills.LoadIndex()
161174
if err == nil && len(idx.Skills) > 0 {
162175
system += "\nAvailable skills:\n"

provider/gemini/gemini.go

Lines changed: 40 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -219,16 +219,21 @@ func convertMessage(m provider.Message) []map[string]any {
219219
role = "model"
220220
}
221221

222-
var parts []map[string]any
222+
// Check if this message contains tool results — Gemini requires
223+
// functionResponse parts in a separate "user" content with all
224+
// results grouped together.
225+
var funcResponseParts []map[string]any
226+
var otherParts []map[string]any
227+
223228
for _, block := range m.Content {
224229
switch block.Type {
225230
case provider.ContentText:
226-
parts = append(parts, map[string]any{"text": block.Text})
231+
otherParts = append(otherParts, map[string]any{"text": block.Text})
227232
case provider.ContentToolUse:
228233
if block.ToolCall != nil {
229234
var args map[string]any
230235
_ = json.Unmarshal(block.ToolCall.Input, &args)
231-
parts = append(parts, map[string]any{
236+
otherParts = append(otherParts, map[string]any{
232237
"functionCall": map[string]any{
233238
"name": block.ToolCall.Name,
234239
"args": args,
@@ -237,28 +242,46 @@ func convertMessage(m provider.Message) []map[string]any {
237242
}
238243
case provider.ContentToolResult:
239244
if block.ToolResult != nil {
240-
// Gemini expects functionResponse in a separate "user" content.
241245
var response map[string]any
242246
if err := json.Unmarshal([]byte(block.ToolResult.Content), &response); err != nil {
243247
response = map[string]any{"result": block.ToolResult.Content}
244248
}
245-
return []map[string]any{{
246-
"role": "user",
247-
"parts": []map[string]any{{
248-
"functionResponse": map[string]any{
249-
"name": block.ToolResult.ToolUseID,
250-
"response": response,
251-
},
252-
}},
253-
}}
249+
// Gemini matches functionResponse by function name, not by call ID.
250+
name := block.ToolResult.Name
251+
if name == "" {
252+
name = block.ToolResult.ToolUseID
253+
}
254+
funcResponseParts = append(funcResponseParts, map[string]any{
255+
"functionResponse": map[string]any{
256+
"name": name,
257+
"response": response,
258+
},
259+
})
254260
}
255261
}
256262
}
257263

258-
return []map[string]any{{
259-
"role": role,
260-
"parts": parts,
261-
}}
264+
var result []map[string]any
265+
if len(otherParts) > 0 {
266+
result = append(result, map[string]any{
267+
"role": role,
268+
"parts": otherParts,
269+
})
270+
}
271+
if len(funcResponseParts) > 0 {
272+
result = append(result, map[string]any{
273+
"role": "user",
274+
"parts": funcResponseParts,
275+
})
276+
}
277+
if len(result) == 0 {
278+
result = append(result, map[string]any{
279+
"role": role,
280+
"parts": []map[string]any{},
281+
})
282+
}
283+
284+
return result
262285
}
263286

264287
func (g *Gemini) doRequest(ctx context.Context, url string, body map[string]any) (io.ReadCloser, error) {

provider/types.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ type ToolCall struct {
6262
// ToolResult represents the output of a tool invocation.
// The agent fills it in after running a tool call and sends it back to the
// provider as a ContentToolResult block.
6363
type ToolResult struct {
6464
ToolUseID string `json:"tool_use_id"` // ID of the tool call this result answers (the agent sets it from call.ID).
65+
Name string `json:"name"` // Name of the invoked tool (the agent sets it from call.Name); the Gemini provider uses it as the functionResponse name and falls back to ToolUseID when it is empty.
6566
Content string `json:"content"` // Tool output as text; the Gemini provider tries to decode it as a JSON object and otherwise wraps it as {"result": Content}.
6667
IsError bool `json:"is_error,omitempty"` // Error flag for the invocation; omitted from JSON when false.
6768
}

skills/memory-save/SKILL.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
---
22
name: memory-save
3-
description: Save a personal fact about the user to memory when they ask you to remember something
3+
description: Save and recall personal facts about the user (memories, preferences, relationships)
44
allowed-tools: Bash(obk *)
55
---
66

spectest/assert.go

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
package spectest
2+
3+
import (
4+
"strings"
5+
"testing"
6+
)
7+
8+
// AssertNotEmpty stops the test with t.Fatal when response is empty or
// consists only of whitespace.
func AssertNotEmpty(t *testing.T, response string) {
9+
t.Helper()
10+
// TrimSpace so that a whitespace-only response is treated as empty.
if strings.TrimSpace(response) == "" {
11+
t.Fatal("expected non-empty response")
12+
}
13+
}
14+
15+
// AssertContains checks that response contains every one of substrings,
// comparing case-insensitively. Each missing substring is reported with
// t.Errorf, so the test keeps running and all misses are listed.
func AssertContains(t *testing.T, response string, substrings ...string) {
16+
t.Helper()
17+
// Lower-case the response once; each needle is lower-cased inside the loop.
lower := strings.ToLower(response)
18+
for _, s := range substrings {
19+
if !strings.Contains(lower, strings.ToLower(s)) {
20+
// Report the original (not lower-cased) substring and response.
t.Errorf("expected response to contain %q, got:\n%s", s, response)
21+
}
22+
}
23+
}

spectest/cross_source_test.go

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
package spectest
2+
3+
import (
4+
"context"
5+
"testing"
6+
"time"
7+
)
8+
9+
// TestSpec_SummarizeCommunicationsAcrossSources seeds emails and WhatsApp
10+
// messages from the same person, then asks the agent to summarize all
11+
// communications. The agent must autonomously discover and use both email-read
12+
// and whatsapp-read skills, query both databases, and synthesize the results
13+
// in a single turn.
14+
func TestSpec_SummarizeCommunicationsAcrossSources(t *testing.T) {
15+
// EachProvider is defined elsewhere in this package; presumably it runs the
// callback once per configured provider with a fresh fixture — confirm.
EachProvider(t, func(t *testing.T, fx *LocalFixture) {
16+
// Two emails from Alice; their topics (Q3 budget, Portland offsite) are
// what the judge criteria below look for on the email side.
fx.GivenEmails(t, []Email{
17+
{From: "alice@acme.com", To: "me@example.com", Subject: "Q3 Budget Review", Body: "Hi, please review the Q3 budget spreadsheet I shared. We need to finalize numbers by Friday."},
18+
{From: "alice@acme.com", To: "me@example.com", Subject: "Team Offsite in Portland", Body: "I'm thinking we do the offsite in Portland in October. Thoughts?"},
19+
})
20+
21+
// Two WhatsApp messages from the same contact, carrying distinctive
// tokens (TRV-8842, NK-2047) that the judge criteria reference.
fx.GivenWhatsAppMessages(t, []WhatsAppMessage{
22+
{SenderJID: "alice@s.whatsapp.net", SenderName: "Alice", ChatJID: "alice@s.whatsapp.net", ChatName: "Alice", Text: "Booked Trattoria Vecchia for Friday dinner, confirmation code TRV-8842."},
23+
{SenderJID: "alice@s.whatsapp.net", SenderName: "Alice", ChatJID: "alice@s.whatsapp.net", ChatName: "Alice", Text: "Can you bring the Nakamura prototype to the offsite? Serial number NK-2047."},
24+
})
25+
26+
a := fx.Agent(t)
27+
// Bound the whole agent run to 180 seconds.
ctx, cancel := context.WithTimeout(context.Background(), 180*time.Second)
28+
defer cancel()
29+
30+
prompt := "Summarize all communications from Alice across both email and WhatsApp."
31+
result, err := a.Run(ctx, prompt)
32+
if err != nil {
33+
t.Fatalf("agent.Run: %v", err)
34+
}
35+
36+
AssertNotEmpty(t, result)
37+
// AssertJudge is defined elsewhere in this package; presumably it asks the
// given provider/model to grade result against these criteria — confirm.
AssertJudge(t, fx.Provider, fx.Model, prompt, result,
38+
"The response must cover topics from BOTH email and WhatsApp. "+
39+
"It should mention the Q3 budget review or Portland offsite from email, AND reference "+
40+
"Trattoria Vecchia, TRV-8842, Nakamura prototype, or NK-2047 from WhatsApp. "+
41+
"It should not claim that data from one source is missing if it was provided.")
42+
})
43+
}
44+
45+
// TestSpec_RecallMemoryAndCorrelateEmails seeds personal memories about a
46+
// relationship and emails with project details. The agent must autonomously
47+
// check memories for context about the person, search emails for specifics,
48+
// and combine both into a coherent answer in a single turn.
49+
func TestSpec_RecallMemoryAndCorrelateEmails(t *testing.T) {
50+
// EachProvider is defined elsewhere in this package; presumably it runs the
// callback once per configured provider with a fresh fixture — confirm.
EachProvider(t, func(t *testing.T, fx *LocalFixture) {
51+
// Memories supply the relationship fact (Raj is the tech lead at Zephyr
// Industries) that the judge criteria expect to come "from memory".
fx.GivenMemories(t, []UserMemory{
52+
{Content: "Raj Patel is my tech lead at Zephyr Industries", Category: "relationship"},
53+
{Content: "Project Firebird has a hard deadline of June 15, 2025", Category: "project"},
54+
})
55+
56+
// Emails supply the Sprint 7 and Launch Prep details that the judge
// criteria expect to come "from emails".
fx.GivenEmails(t, []Email{
57+
{From: "raj.patel@zephyr.io", To: "me@example.com", Subject: "Project Firebird Sprint 7 Retro", Body: "Sprint 7 retro is scheduled for May 22. Please prepare your notes on the auth module refactor."},
58+
{From: "raj.patel@zephyr.io", To: "me@example.com", Subject: "Project Firebird Launch Prep", Body: "Client confirmed the staging demo for June 10. We need all QA passed by June 8."},
59+
})
60+
61+
a := fx.Agent(t)
62+
// Bound the whole agent run to 180 seconds.
ctx, cancel := context.WithTimeout(context.Background(), 180*time.Second)
63+
defer cancel()
64+
65+
prompt := "Tell me everything about Raj and Project Firebird. Check both my memories and emails."
66+
result, err := a.Run(ctx, prompt)
67+
if err != nil {
68+
t.Fatalf("agent.Run: %v", err)
69+
}
70+
71+
AssertNotEmpty(t, result)
72+
// AssertJudge is defined elsewhere in this package; presumably it asks the
// given provider/model to grade result against these criteria — confirm.
AssertJudge(t, fx.Provider, fx.Model, prompt, result,
73+
"The response must include information from BOTH memories and emails. "+
74+
"It should mention Raj Patel is the tech lead at Zephyr Industries (from memory) AND mention "+
75+
"email subjects or content about Project Firebird Sprint 7 and Launch Prep (from emails). "+
76+
"It should not only use one source.")
77+
})
78+
}

spectest/email_query_test.go

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
package spectest
2+
3+
import (
4+
"context"
5+
"testing"
6+
"time"
7+
)
8+
9+
// TestSpec_FindEmailsBySender seeds three emails (two from Alice, one from
// Bob), asks the agent to find Alice's emails, and has the response judged:
// both of Alice's subjects must appear and Bob's must not.
func TestSpec_FindEmailsBySender(t *testing.T) {
10+
// EachProvider is defined elsewhere in this package; presumably it runs the
// callback once per configured provider with a fresh fixture — confirm.
EachProvider(t, func(t *testing.T, fx *LocalFixture) {
11+
// Bob's email is the distractor that the judge criteria say must be excluded.
fx.GivenEmails(t, []Email{
12+
{From: "alice@example.com", Subject: "Meeting Tomorrow", Body: "Let's meet at 2pm"},
13+
{From: "bob@example.com", Subject: "Project Update", Body: "Here is the latest"},
14+
{From: "alice@example.com", Subject: "Lunch Plans", Body: "Friday lunch?"},
15+
})
16+
17+
a := fx.Agent(t)
18+
// Bound the whole agent run to 120 seconds.
ctx, cancel := context.WithTimeout(context.Background(), 120*time.Second)
19+
defer cancel()
20+
21+
prompt := "Find emails from Alice"
22+
result, err := a.Run(ctx, prompt)
23+
if err != nil {
24+
t.Fatalf("agent.Run: %v", err)
25+
}
26+
27+
AssertNotEmpty(t, result)
28+
// AssertJudge is defined elsewhere in this package; presumably it asks the
// given provider/model to grade result against these criteria — confirm.
AssertJudge(t, fx.Provider, fx.Model, prompt, result,
29+
"The response must list Alice's emails. It should mention both 'Meeting Tomorrow' and 'Lunch Plans' subjects. "+
30+
"It should NOT include Bob's 'Project Update' email.")
31+
})
32+
}

spectest/fixture.go

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
package spectest
2+
3+
import (
4+
"testing"
5+
6+
"github.com/priyanshujain/openbotkit/agent"
7+
)
8+
9+
// Email describes one email to seed into a fixture via Fixture.GivenEmails.
// The spec tests in this package set only From, To, Subject and Body.
type Email struct {
10+
MessageID string // NOTE(review): never set by the visible tests; presumably defaulted by the fixture — confirm.
11+
Account string // NOTE(review): never set by the visible tests; presumably defaulted by the fixture — confirm.
12+
From string // Sender address, e.g. "alice@acme.com".
13+
To string // Recipient address, e.g. "me@example.com".
14+
Subject string // Subject line.
15+
Body string // Message body text.
16+
}
17+
18+
// WhatsAppMessage describes one WhatsApp message to seed into a fixture via
// Fixture.GivenWhatsAppMessages.
type WhatsAppMessage struct {
19+
MessageID string // NOTE(review): never set by the visible tests; presumably defaulted by the fixture — confirm.
20+
ChatJID string // JID of the chat, e.g. "alice@s.whatsapp.net".
21+
ChatName string // Display name of the chat.
22+
SenderJID string // JID of the sender.
23+
SenderName string // Display name of the sender.
24+
Text string // Message text.
25+
IsFromMe bool // Presumably true when the message was sent by the fixture's own user — confirm.
26+
}
27+
28+
// UserMemory describes one personal memory to seed into a fixture via
// Fixture.GivenMemories.
type UserMemory struct {
29+
Content string // The remembered fact, as free text.
30+
Category string // identity, preference, relationship, project
31+
}
32+
33+
// Fixture is the contract a spec-test environment exposes: Given* methods
// seed data, and Agent returns the agent under test. The visible tests seed
// data first and then call Agent.
// NOTE(review): the tests use *LocalFixture, which presumably implements this
// interface — confirm.
type Fixture interface {
34+
Agent(t *testing.T) *agent.Agent // Returns the agent that the test will run.
35+
GivenEmails(t *testing.T, emails []Email) // Seeds the given emails.
36+
GivenWhatsAppMessages(t *testing.T, messages []WhatsAppMessage) // Seeds the given WhatsApp messages.
37+
GivenMemories(t *testing.T, memories []UserMemory) // Seeds the given user memories.
38+
}

0 commit comments

Comments
 (0)