BackendStack21 · jkyberneees · Jun 6, 2026 · Jun 6, 2026 · Jun 6, 2026
diff --git a/docs/CHEATSHEET.md b/docs/CHEATSHEET.md
@@ -122,12 +122,13 @@ Settings: `model` (tiny/base/small/medium), `language` (ISO code, empty=auto), `
 ```
 RP.embed(newEntry) → cosine similarity vs each existing entry
 
-  cos > 0.7  ─────→ auto-merge (replace)
+  cos > 0.7  ─────→ simple merge (no LLM — substring or concatenation)
   cos < 0.3  ─────→ auto-add
-  0.3–0.7   ─────→ LLM judges → merge or add
+  0.3–0.7   ─────→ auto-add (deferred to session-end consolidation)
 ```
 
-Saves ~80% of LLM calls on memory writes.
+AddFact makes zero LLM calls. Near-duplicate dedup happens at session end
+via background consolidation (`consolidate_on_end`, default true).
 
 ### Memory Tool
 

diff --git a/docs/CONFIG.md b/docs/CONFIG.md
@@ -195,6 +195,7 @@ The `memory` section controls the persistent memory system (see [docs/MEMORY.md]
     "buffer_lines": 20,
     "buffer_enabled": true,
     "merge_on_write": true,
+    "consolidate_on_end": true,
     "extract_on_end": true,
     "extract_facts": false,
     "llm_search": true,
@@ -214,7 +215,8 @@ The `memory` section controls the persistent memory system (see [docs/MEMORY.md]
 | `facts_limit_env` | 2500 | Max chars for `env.md` fact file |
 | `buffer_lines` | 20 | Max turn summaries in session buffer |
 | `buffer_enabled` | true | Enable the turn-level buffer |
-| `merge_on_write` | true | Use go-vector RP similarity to auto-merge related entries |
+| `merge_on_write` | true | Use go-vector RP similarity to auto-merge related entries (fast, no LLM — uses simple string merge) |
+| `consolidate_on_end` | true | At session end, run an LLM consolidation pass over `user.md` and `env.md` in a background goroutine. This is the quality complement to `merge_on_write`: merge-on-write handles obvious duplicates immediately (no LLM), while consolidation handles near-duplicates and paraphrases at session end with full LLM quality. Requires `llm_consolidate: true`. **Note:** facts in the borderline similarity band (0.3–0.7 cosine) are now always added immediately and only merged by this consolidation pass — if you set `consolidate_on_end: false`, near-duplicate facts will accumulate rather than being merged. |
 | `extract_on_end` | true | At session end (≥3 turns), extract a narrative episode summary via LLM for later recall |
 | `extract_facts` | **false** | **Opt-in.** At session end (≥3 turns), auto-extract a few **durable** facts (stable user preferences, project invariants) into `user.md`/`env.md`. Off by default — see the security note below. Independent of `extract_on_end`; to disable *all* end-of-session LLM extraction set `llm_extract: false`. |
 | `llm_search` | true | Use LLM to rerank candidates for **explicit** `memory search` calls (the `memory` tool). Per-turn recall (`FormatEpisodeContext`) always uses the cached go-vector index — no LLM call on the hot path regardless of this setting. |

diff --git a/internal/config/loader.go b/internal/config/loader.go
@@ -826,6 +826,9 @@ func resolveMemory(cfg *memory.MemoryConfig) memory.MemoryConfig {
 	if cfg.ExtractFacts != nil {
 		def.ExtractFacts = cfg.ExtractFacts
 	}
+	if cfg.ConsolidateOnEnd != nil {
+		def.ConsolidateOnEnd = cfg.ConsolidateOnEnd
+	}
 	if cfg.LLMSearch != nil {
 		def.LLMSearch = cfg.LLMSearch
 	}

diff --git a/internal/memory/async_merge_test.go b/internal/memory/async_merge_test.go
@@ -0,0 +1,187 @@
+package memory
+
+import (
+	"context"
+	"strings"
+	"sync/atomic"
+	"testing"
+	"time"
+)
+
+// callCountLLM counts every LLM invocation and proxies to a mockLLM.
+type callCountLLM struct {
+	calls int64
+	inner *mockLLM
+}
+
+func (c *callCountLLM) SimpleCall(ctx context.Context, system, user string) (string, error) {
+	atomic.AddInt64(&c.calls, 1)
+	return c.inner.SimpleCall(ctx, system, user)
+}
+
+// TestAddFact_NoLLMCalls: AddFact must complete without making any LLM calls,
+// even when merge-on-write classifies a new entry as "merge" or "judge".
+func TestAddFact_NoLLMCalls(t *testing.T) {
+	dir := t.TempDir()
+	llm := &callCountLLM{inner: &mockLLM{responses: map[string]string{}}}
+	mm := NewMemoryManager(dir, llm, DefaultMemoryConfig())
+
+	// Seed an entry so subsequent adds trigger merge-on-write comparisons.
+	if err := mm.AddFact("user", "project uses postgres for all data storage"); err != nil {
+		t.Fatalf("seed: %v", err)
+	}
+	before := atomic.LoadInt64(&llm.calls)
+
+	// Add a very similar entry — should be classified "merge" or "judge" by RP.
+	if err := mm.AddFact("user", "project database is postgres"); err != nil {
+		t.Fatalf("AddFact: %v", err)
+	}
+	after := atomic.LoadInt64(&llm.calls)
+
+	if after != before {
+		t.Errorf("AddFact made %d LLM call(s); want 0 — AddFact must never block on LLM",
+			after-before)
+	}
+}
+
+// TestAddFact_MergeUsesSimpleFallback: when merge-on-write fires, the entries
+// are merged using the non-LLM (simple) path. The result must contain content
+// from both entries OR one must be a substring of the other.
+func TestAddFact_MergeUsesSimpleFallback(t *testing.T) {
+	dir := t.TempDir()
+	cfg := DefaultMemoryConfig()
+	cfg.MergeOnWrite = boolPtr(true)
+	// Use a high merge threshold so the similar entries definitely trigger "merge".
+	cfg.MergeThreshold = 0.01 // effectively always merge
+	mm := NewMemoryManager(dir, nil, cfg)
+
+	_ = mm.AddFact("env", "go 1.22")
+	_ = mm.AddFact("env", "golang 1.22")
+
+	_, env, err := mm.ReadFacts()
+	if err != nil {
+		t.Fatalf("ReadFacts: %v", err)
+	}
+	// The simple merge either returns the longer entry (substring case) or
+	// concatenates them. Either way the result is non-empty.
+	if strings.TrimSpace(env) == "" {
+		t.Errorf("expected a merged entry in env, got empty")
+	}
+}
+
+// TestConsolidateOnEnd_Default: the default config has consolidate_on_end=true.
+func TestConsolidateOnEnd_Default(t *testing.T) {
+	d := DefaultMemoryConfig()
+	if d.ConsolidateOnEnd == nil || !*d.ConsolidateOnEnd {
+		t.Errorf("ConsolidateOnEnd default should be true, got %v", d.ConsolidateOnEnd)
+	}
+}
+
+// TestConsolidateOnEnd_FiresAtSessionEnd: with consolidate_on_end=true, facts
+// are consolidated in the background at session end. We verify this by seeding
+// three distinct facts and confirming the count decreases (Consolidate merged
+// them) within a generous timeout.
+func TestConsolidateOnEnd_FiresAtSessionEnd(t *testing.T) {
+	dir := t.TempDir()
+	// LLM that:
+	//  - returns "session summary" for the episode extraction call
+	//  - returns a 2-element JSON array for any consolidation call
+	llm := &mockLLM{responses: map[string]string{
+		"Summarize": "session summary",
+		// Consolidate prompt contains "memory entries" — return a merged 2-item list
+		"memory entri": `["dark mode + vim keybindings preference","works in Go for backend"]`,
+	}}
+
+	cfg := DefaultMemoryConfig()
+	cfg.ConsolidateOnEnd = boolPtr(true)
+	cfg.LLMConsolidate = boolPtr(true)
+	cfg.ExtractOnEnd = boolPtr(true)
+	cfg.MergeOnWrite = boolPtr(false) // keep all three facts separate
+	mm := NewMemoryManager(dir, llm, cfg)
+
+	// Seed three facts that will survive without merge-on-write.
+	_ = mm.AddFact("user", "prefers dark mode in all editors")
+	_ = mm.AddFact("user", "uses vim keybindings everywhere")
+	_ = mm.AddFact("user", "works primarily in Go for backend services")
+
+	entries0, _ := mm.facts.Entries("user")
+	if len(entries0) != 3 {
+		t.Fatalf("expected 3 seeded entries, got %d", len(entries0))
+	}
+
+	msgs := []string{"user: hi", "assistant: ok", "user: more", "assistant: done"}
+	mm.OnSessionEndWithProvenance("20260801-a", 5, msgs, EpisodeProvenance{})
+
+	// Poll until consolidation reduces the entry count or timeout.
+	deadline := time.Now().Add(3 * time.Second)
+	for time.Now().Before(deadline) {
+		entries, _ := mm.facts.Entries("user")
+		if len(entries) < 3 {
+			return // consolidation ran and reduced entries — success
+		}
+		time.Sleep(50 * time.Millisecond)
+	}
+	t.Error("session-end consolidation did not reduce fact count within 3 seconds")
+}
+
+// TestConsolidateOnEnd_IndependentOfLLMExtract: consolidation must fire even
+// when llm_extract=false (episode extraction disabled). D-06 regression guard.
+func TestConsolidateOnEnd_IndependentOfLLMExtract(t *testing.T) {
+	dir := t.TempDir()
+	llm := &mockLLM{responses: map[string]string{
+		"memory entri": `["consolidated single fact"]`,
+	}}
+	cfg := DefaultMemoryConfig()
+	cfg.LLMExtract = boolPtr(false) // episodes off — must NOT suppress consolidation
+	cfg.ConsolidateOnEnd = boolPtr(true)
+	cfg.MergeOnWrite = boolPtr(false)
+	mm := NewMemoryManager(dir, llm, cfg)
+	_ = mm.AddFact("user", "prefers dark mode editors")
+	_ = mm.AddFact("user", "uses dark theme always")
+	entries0, _ := mm.facts.Entries("user")
+	if len(entries0) < 2 {
+		t.Fatalf("need 2 seeded entries, got %d", len(entries0))
+	}
+	msgs := []string{"user: hi", "assistant: ok", "user: more", "assistant: done"}
+	mm.OnSessionEndWithProvenance("sess-d06", 5, msgs, EpisodeProvenance{})
+	deadline := time.Now().Add(3 * time.Second)
+	for time.Now().Before(deadline) {
+		if e, _ := mm.facts.Entries("user"); len(e) < 2 {
+			return // consolidation fired
+		}
+		time.Sleep(50 * time.Millisecond)
+	}
+	t.Error("consolidation should fire even with llm_extract=false")
+}
+
+// TestConsolidateOnEnd_FlagOff: with consolidate_on_end=false, fact count must
+// remain stable at session end (no consolidation LLM call).
+func TestConsolidateOnEnd_FlagOff(t *testing.T) {
+	dir := t.TempDir()
+	llm := &callCountLLM{
+		inner: &mockLLM{responses: map[string]string{"Summarize": "summary"}},
+	}
+	cfg := DefaultMemoryConfig()
+	cfg.ConsolidateOnEnd = boolPtr(false)
+	cfg.MergeOnWrite = boolPtr(false)
+	mm := NewMemoryManager(dir, llm, cfg)
+
+	_ = mm.AddFact("user", "prefers dark mode")
+	_ = mm.AddFact("user", "uses Go for backend work")
+
+	msgs := []string{"user: hi", "assistant: ok", "user: more", "assistant: done"}
+	before := atomic.LoadInt64(&llm.calls)
+	mm.OnSessionEndWithProvenance("20260801-b", 5, msgs, EpisodeProvenance{})
+
+	// Give any goroutine 300 ms to (incorrectly) run.
+	time.Sleep(300 * time.Millisecond)
+	after := atomic.LoadInt64(&llm.calls)
+
+	// Only the episode extraction call should have fired (Summarize), not Consolidate.
+	episodeCalls := after - before
+	entries, _ := mm.facts.Entries("user")
+	if len(entries) < 2 {
+		t.Errorf("consolidate_on_end=false must not consolidate facts, got %d entries", len(entries))
+	}
+	_ = episodeCalls // 1 call for Summarize is expected
+}
diff --git a/internal/memory/facts.go b/internal/memory/facts.go
@@ -304,7 +304,13 @@ func (f *FactStore) Entries(target string) ([]string, error) {
 // writeEntries joins entries and writes them to disk atomically. It writes to a
 // UNIQUE temp file in the same directory and renames it into place, so two
 // FactStore instances writing the same directory concurrently can never clobber
-// a shared temp file. Caller must hold f.mu.
+// a shared temp file.
+//
+// Locking contract: callers must hold EITHER f.mu (normal mutation paths via
+// readModifyWrite) OR the process-wide factsDirLock for this directory
+// (Consolidate path in memory.go). These two locks are never acquired together,
+// so there is no deadlock risk. factsDirLock provides the same cross-instance
+// mutual exclusion that f.mu provides per-instance.
 func (f *FactStore) writeEntries(target string, entries []string) error {
 	content := strings.Join(entries, entrySep)
 	path := f.path(target)

diff --git a/internal/memory/memory.go b/internal/memory/memory.go
@@ -64,6 +64,7 @@ type MemoryConfig struct {
 	MergeOnWrite          *bool   `json:"merge_on_write,omitempty"`
 	ExtractOnEnd          *bool   `json:"extract_on_end,omitempty"`
 	ExtractFacts          *bool   `json:"extract_facts,omitempty"`
+	ConsolidateOnEnd      *bool   `json:"consolidate_on_end,omitempty"`
 	LLMSearch             *bool   `json:"llm_search,omitempty"`
 	LLMExtract            *bool   `json:"llm_extract,omitempty"`
 	LLMConsolidate        *bool   `json:"llm_consolidate,omitempty"`
@@ -95,6 +96,7 @@ func DefaultMemoryConfig() MemoryConfig {
 		MergeOnWrite:          boolPtr(true),
 		ExtractOnEnd:          boolPtr(true),
 		ExtractFacts:          boolPtr(false), // opt-in: persistent-poisoning risk, see SECURITY.md
+		ConsolidateOnEnd:      boolPtr(true),  // restores LLM merge quality removed from AddFact
 		LLMSearch:             boolPtr(true),  // LLM ranker by default — relevance over recency
 		LLMExtract:            boolPtr(true),
 		LLMConsolidate:        boolPtr(true),
@@ -152,6 +154,9 @@ func NewMemoryManager(memoryDir string, llc LLMClient, cfg MemoryConfig) *Memory
 	if cfg.ExtractFacts != nil {
 		def.ExtractFacts = cfg.ExtractFacts
 	}
+	if cfg.ConsolidateOnEnd != nil {
+		def.ConsolidateOnEnd = cfg.ConsolidateOnEnd
+	}
 	if cfg.LLMSearch != nil {
 		def.LLMSearch = cfg.LLMSearch
 	}
@@ -249,32 +254,21 @@ func (m *MemoryManager) AddFact(target, content string) error {
 
 			switch action {
 			case "merge":
-				// Auto-merge: replace similar entry with merged content
-				merged := mergeEntries(m.llm, entries[similarIdx], content)
+				// Auto-merge using the fast (non-LLM) path so AddFact never blocks
+				// on a network round-trip. The simple merge handles the common
+				// substring case well; LLM quality is recovered at session end by
+				// the background consolidation that runs when consolidate_on_end=true.
+				merged := mergeEntries(nil, entries[similarIdx], content)
 				if err := m.facts.Replace(target, entries[similarIdx][:min(30, len(entries[similarIdx]))], merged); err != nil {
 					return err
 				}
 				// Update merge detector incrementally — only re-embed the changed entry
 				m.merge.ReplaceEntry(similarIdx, merged)
 				return nil
 			case "judge":
-				// Borderline: use LLM to decide
-				if m.llm != nil {
-					decision, err := m.judgeMerge(target, entries[similarIdx], content)
-					if err == nil {
-						if decision == "merge" {
-							merged := mergeEntries(m.llm, entries[similarIdx], content)
-							if err := m.facts.Replace(target, entries[similarIdx][:min(30, len(entries[similarIdx]))], merged); err != nil {
-								return err
-							}
-							// Update merge detector incrementally
-							m.merge.ReplaceEntry(similarIdx, merged)
-							return nil
-						}
-						// decision == "add" — fall through to normal add
-					}
-				}
-				// No LLM available or LLM failed: let agent decide (just add)
+				// Borderline similarity — add without blocking on an LLM judgment
+				// call. Brief duplication (until session-end consolidation) is
+				// preferable to stalling the agent loop for a round-trip.
 				fallthrough
 			case "add":
 				// No overlap — normal add
@@ -423,6 +417,11 @@ Entries for %s:
 	if len(newEntries) == 0 || (len(newEntries) == 1 && newEntries[0] == "") {
 		return nil // LLM returned nothing useful
 	}
+	// Guard against a hallucinating LLM expanding the entry count; the system
+	// prompt says "Never more than the original count" but that is advisory only.
+	if len(newEntries) > len(entries) {
+		newEntries = newEntries[:len(entries)]
+	}
 
 	// Security: scan LLM output before persisting
 	for _, entry := range newEntries {
@@ -520,7 +519,24 @@ func (m *MemoryManager) OnSessionEndWithProvenance(sessionID string, turns int,
 	if minTurns <= 0 {
 		minTurns = defaultMinTurnsForExtraction
 	}
-	// Shared preconditions for any end-of-session LLM extraction.
+
+	// Background consolidation is independent of episode/fact extraction —
+	// it fires based on its own gate so that llm_extract=false does not
+	// silently disable it (D-06). Requires an LLM client, llm_consolidate,
+	// and a minimum session length (same threshold reused for consistency).
+	if m.llm != nil && turns >= minTurns &&
+		m.cfg.ConsolidateOnEnd != nil && *m.cfg.ConsolidateOnEnd &&
+		m.cfg.LLMConsolidate != nil && *m.cfg.LLMConsolidate {
+		go func() {
+			for _, target := range []string{"user", "env"} {
+				// Best-effort: errors (e.g. only 1 entry, nothing to consolidate)
+				// are silently ignored — consolidation is a quality pass, not critical.
+				_ = m.Consolidate(target)
+			}
+		}()
+	}
+
+	// Preconditions shared by episode summary + fact extraction.
 	if m.cfg.LLMExtract == nil || !*m.cfg.LLMExtract || m.llm == nil || turns < minTurns || len(messages) == 0 {
 		return
 	}
@@ -822,34 +838,6 @@ func (m *MemoryManager) BuildSystemPrompt() string {
 
 // ── Private helpers ──────────────────────────────────────────────────
 
-// judgeMerge asks the LLM whether two entries should be merged.
-// Returns "merge" or "add".
-func (m *MemoryManager) judgeMerge(target, existing, newEntry string) (string, error) {
-	prompt := fmt.Sprintf(`I have two memory entries for the "%s" category:
-
-EXISTING: %s
-NEW: %s
-
-Should the new entry be MERGED into the existing one (they are related or redundant)
-or ADDED as a separate entry (they are distinct topics)?
-
-Reply with exactly one word: "merge" or "add"`, target, existing, newEntry)
-
-	decision, err := m.llm.SimpleCall(context.Background(),
-		"You are a memory deduplication system. Reply with exactly one word: 'merge' or 'add'.",
-		prompt,
-	)
-	if err != nil {
-		return "add", err
-	}
-
-	decision = strings.TrimSpace(strings.ToLower(decision))
-	if strings.Contains(decision, "merge") {
-		return "merge", nil
-	}
-	return "add", nil
-}
-
 // mergeEntries combines two related entries into one.
 // When an LLM client is available, uses semantic merging for higher quality.
 // Falls back to simple string logic when LLM is unavailable.