test(memory): add mock LLM tests and integration tests

priyanshujain · priyanshujain · commit be4a5b5bcf0e · 2026-03-09T15:31:33.000+07:00
Unit tests: Extract with mock LLM (verifies prompt construction,
JSON parsing, filtering), Reconcile with mock (NOOP, UPDATE,
DELETE, ADD decisions).

Integration tests: real LLM extraction and end-to-end
extract→reconcile flow, skipped when no API keys set.
diff --git a/memory/extract_test.go b/memory/extract_test.go
@@ -1,6 +1,30 @@
 package memory
 
-import "testing"
+import (
+	"context"
+	"testing"
+
+	"github.com/priyanshujain/openbotkit/provider"
+)
+// mockLLM is a test double passed to Extract in place of a real LLM; it replays a canned reply and records the request.
+type mockLLM struct {
+	response string                // text returned in the single content block of every reply
+	err      error                 // when non-nil, Chat returns this instead of a reply
+	lastReq  *provider.ChatRequest // most recent request seen by Chat; nil until Chat is called
+}
+// Chat remembers req in lastReq, then yields the configured error or a single text block holding response.
+func (m *mockLLM) Chat(_ context.Context, req provider.ChatRequest) (*provider.ChatResponse, error) {
+	m.lastReq = &req // stored before the error check, so failed calls are recorded too
+	if m.err != nil {
+		return nil, m.err
+	}
+	reply := provider.ChatResponse{StopReason: provider.StopEndTurn}
+	reply.Content = append(reply.Content, provider.ContentBlock{
+		Type: provider.ContentText,
+		Text: m.response,
+	})
+	return &reply, nil
+}
 
 func TestPreFilter(t *testing.T) {
 	messages := []string{
@@ -102,3 +126,70 @@ func TestIsAck(t *testing.T) {
 		}
 	}
 }
+// TestExtractWithMockLLM checks that Extract parses the mock's JSON reply into facts and sends one message with the extraction system prompt.
+func TestExtractWithMockLLM(t *testing.T) {
+	llm := &mockLLM{
+		response: `[{"content": "User prefers dark mode", "category": "preference"}, {"content": "User's name is Priyanshu", "category": "identity"}]`,
+	}
+
+	messages := []string{
+		"My name is Priyanshu and I prefer dark mode in all my editors",
+		"I've been working on this project for a while now",
+	}
+
+	facts, err := Extract(context.Background(), llm, messages)
+	if err != nil {
+		t.Fatalf("Extract: %v", err)
+	}
+	if len(facts) != 2 { // fatal, because the indexing below needs both facts
+		t.Fatalf("expected 2 facts, got %d", len(facts))
+	}
+	if facts[0].Content != "User prefers dark mode" {
+		t.Errorf("fact[0].Content = %q", facts[0].Content)
+	}
+	if facts[1].Category != "identity" {
+		t.Errorf("fact[1].Category = %q", facts[1].Category)
+	}
+
+	// Verify the request the mock captured: extraction system prompt and exactly one message.
+	if llm.lastReq == nil { // lastReq is only set inside mockLLM.Chat
+		t.Fatal("expected LLM to be called")
+	}
+	if llm.lastReq.System != extractionPrompt {
+		t.Error("expected system prompt to be the extraction prompt")
+	}
+	if len(llm.lastReq.Messages) != 1 {
+		t.Fatalf("expected 1 message, got %d", len(llm.lastReq.Messages))
+	}
+}
+// TestExtractAllFiltered expects Extract to return no facts, and never call the LLM, for "ok", "yes" and "thanks".
+func TestExtractAllFiltered(t *testing.T) {
+	stub := &mockLLM{response: "should not be called"}
+	acks := []string{"ok", "yes", "thanks"}
+
+	got, err := Extract(context.Background(), stub, acks)
+	switch {
+	case err != nil:
+		t.Fatalf("Extract: %v", err)
+	case len(got) != 0:
+		t.Fatalf("expected 0 facts, got %d", len(got))
+	}
+	// lastReq is only set inside mockLLM.Chat, so nil shows the mock was never invoked.
+	if stub.lastReq != nil {
+		t.Error("LLM should not have been called when all messages filtered")
+	}
+}
+// TestExtractEmptyResponse expects Extract to yield zero facts, without error, when the LLM replies with `[]`.
+func TestExtractEmptyResponse(t *testing.T) {
+	stub := &mockLLM{response: `[]`}
+	input := []string{"I've been thinking about this problem for a while"}
+
+	// An empty JSON array from the model must surface as an empty result, not a failure.
+	got, err := Extract(context.Background(), stub, input)
+	switch {
+	case err != nil:
+		t.Fatalf("Extract: %v", err)
+	case len(got) != 0:
+		t.Fatalf("expected 0 facts, got %d", len(got))
+	}
+}
diff --git a/memory/integration_test.go b/memory/integration_test.go
@@ -0,0 +1,165 @@
+package memory
+
+import (
+	"context"
+	"os"
+	"testing"
+
+	"github.com/priyanshujain/openbotkit/provider"
+	"github.com/priyanshujain/openbotkit/provider/anthropic"
+	"github.com/priyanshujain/openbotkit/provider/gemini"
+	"github.com/priyanshujain/openbotkit/provider/openai"
+)
+// providerTestCase pairs a configured provider with the model name the integration tests run against it.
+type providerTestCase struct {
+	name     string            // subtest name passed to t.Run
+	provider provider.Provider // client built from the corresponding API key
+	model    string            // model identifier copied into each request by providerLLM
+}
+// availableProviders returns one test case per provider whose API key environment variable is set, and skips the calling test if none is.
+func availableProviders(t *testing.T) []providerTestCase {
+	t.Helper() // report the skip at the caller's line, not inside this helper
+	var providers []providerTestCase
+	// Each provider is included only when its API key environment variable is non-empty.
+	if key := os.Getenv("ANTHROPIC_API_KEY"); key != "" {
+		providers = append(providers, providerTestCase{
+			name:     "anthropic",
+			provider: anthropic.New(key),
+			model:    "claude-sonnet-4-6",
+		})
+	}
+	if key := os.Getenv("OPENAI_API_KEY"); key != "" {
+		providers = append(providers, providerTestCase{
+			name:     "openai",
+			provider: openai.New(key),
+			model:    "gpt-4o-mini",
+		})
+	}
+	if key := os.Getenv("GEMINI_API_KEY"); key != "" {
+		providers = append(providers, providerTestCase{
+			name:     "gemini",
+			provider: gemini.New(key),
+			model:    "gemini-2.0-flash",
+		})
+	}
+	// t.Skip stops the calling test here, so callers never receive an empty slice.
+	if len(providers) == 0 {
+		t.Skip("no API keys set — skipping integration tests (set ANTHROPIC_API_KEY, OPENAI_API_KEY, or GEMINI_API_KEY)")
+	}
+	return providers
+}
+// providerLLM adapts a provider.Provider for Extract and Reconcile by pinning every request to one model.
+type providerLLM struct {
+	p     provider.Provider // underlying provider that performs the chat call
+	model string            // model name written into each outgoing request
+}
+// Chat overwrites req.Model with the adapter's model and forwards the call to the wrapped provider.
+func (pl *providerLLM) Chat(ctx context.Context, req provider.ChatRequest) (*provider.ChatResponse, error) {
+	req.Model = pl.model // req is passed by value, so the caller's Model field is not modified
+	return pl.p.Chat(ctx, req)
+}
+// TestIntegration_Extract runs Extract against every provider with an API key set and checks each returned fact has content and a known category.
+func TestIntegration_Extract(t *testing.T) {
+	for _, tc := range availableProviders(t) { // skips this test when no API key is set
+		t.Run(tc.name, func(t *testing.T) {
+			llm := &providerLLM{p: tc.provider, model: tc.model}
+
+			messages := []string{
+				"My name is Alice and I'm a software engineer at TechCorp",
+				"I really prefer using Go for backend development over Python",
+				"I'm currently building a personal assistant called BotKit",
+			}
+
+			facts, err := Extract(context.Background(), llm, messages)
+			if err != nil {
+				t.Fatalf("Extract: %v", err)
+			}
+
+			if len(facts) == 0 { // the exact facts are not pinned; only a lower bound is asserted
+				t.Fatal("expected at least 1 fact extracted")
+			}
+
+			// Every fact must have non-empty content and one of these four categories.
+			validCategories := map[string]bool{
+				"identity": true, "preference": true,
+				"relationship": true, "project": true,
+			}
+			for _, f := range facts {
+				if f.Content == "" {
+					t.Error("fact has empty content")
+				}
+				if !validCategories[f.Category] {
+					t.Errorf("fact has invalid category %q: %q", f.Category, f.Content)
+				}
+			}
+		})
+	}
+}
+// TestIntegration_ExtractAndReconcile runs Extract, then Reconcile twice, against each available provider and a freshly migrated test DB.
+func TestIntegration_ExtractAndReconcile(t *testing.T) {
+	for _, tc := range availableProviders(t) {
+		t.Run(tc.name, func(t *testing.T) {
+			db := testDB(t)
+			if err := Migrate(db); err != nil {
+				t.Fatalf("migrate: %v", err)
+			}
+			llm := &providerLLM{p: tc.provider, model: tc.model}
+			messages := []string{
+				"My name is Bob and I live in San Francisco",
+				"I prefer dark mode in all my code editors",
+				"I'm working on an open source project called DataFlow",
+			}
+
+			facts, err := Extract(context.Background(), llm, messages)
+			if err != nil {
+				t.Fatalf("Extract: %v", err)
+			}
+			if len(facts) == 0 {
+				t.Fatal("expected at least 1 fact")
+			}
+
+			// Reconcile into empty DB (should all ADD).
+			result, err := Reconcile(context.Background(), db, llm, facts)
+			if err != nil {
+				t.Fatalf("Reconcile: %v", err)
+			}
+			if result.Added == 0 {
+				t.Error("expected at least 1 add")
+			}
+			count, err := Count(db) // a Count failure must not be misreported as an empty DB
+			if err != nil {
+				t.Fatalf("Count: %v", err)
+			}
+			if count == 0 {
+				t.Fatal("expected memories in DB after reconciliation")
+			}
+
+			// Verify memories are retrievable.
+			memories, err := List(db)
+			if err != nil {
+				t.Fatalf("List: %v", err)
+			}
+			for _, m := range memories {
+				if m.Content == "" {
+					t.Error("memory has empty content")
+				}
+				if m.Source != "history" {
+					t.Errorf("memory source = %q, want 'history'", m.Source)
+				}
+			}
+
+			// Second reconciliation with the same facts — should mostly NOOP/skip.
+			result2, err := Reconcile(context.Background(), db, llm, facts)
+			if err != nil {
+				t.Fatalf("second Reconcile: %v", err)
+			}
+			// Growth is only logged, not asserted: some ADD is OK if the LLM decides differently.
+			count2, err := Count(db)
+			if err != nil {
+				t.Fatalf("second Count: %v", err)
+			}
+			t.Logf("first run: added=%d, count=%d; second run: added=%d, updated=%d, skipped=%d, count=%d",
+				result.Added, count, result2.Added, result2.Updated, result2.Skipped, count2)
+		})
+	}
+}
diff --git a/memory/reconcile_test.go b/memory/reconcile_test.go