From ca3f79fee595835b5dd7e1fd25919a5ee7156d2f Mon Sep 17 00:00:00 2001 From: Rolando Santamaria Maso Date: Sat, 6 Jun 2026 09:37:15 +0200 Subject: [PATCH 1/3] feat(memory): path-aware episode/skill provenance + human-gated promote MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the "episode dead-end": provenance tainted episodes by tool *name* only, so read_file/search_files/multi_grep — used in essentially every coding session — always produced an untrusted episode. Untrusted episodes are excluded from recall, and nothing ever set UserApproved, so for normal work the episode tier was written but never replayed (dead weight). Provenance is now argument-aware via a single shared predicate `memory.ToolCallTaints(name, argsJSON)`: - Always untrusted: browser, http_batch, transcribe, and any MCP tool (__). - Path-scoped (read_file, search_files, multi_grep): taint only when the `path` argument resolves to a sensitive location (danger.ClassifyPath → system_write/destructive, e.g. /etc, ~/.ssh, ~/.aws). Workspace and other non-sensitive local reads stay trusted. Malformed/absent path → conservative taint. This precisely re-targets the original "covers /etc/, $HOME/.ssh" concern while letting ordinary sessions stay recallable. The skills provenance (internal/skills/selfimprove.go) now delegates to the same predicate, so learned skills stop being over-flagged for review too. Reusing danger.ClassifyPath means DeriveProvenance keeps its (messages) signature — no cwd plumbing into the async session-end call sites. Escape hatch for genuinely-external episodes: new human-gated CLI `odek memory list` / `odek memory promote ` sets UserApproved. It is intentionally NOT an agent tool — a prompt-injected agent must not be able to self-approve its own poisoned memory. Also fixes two stale descriptions found while mapping the system: the extract_on_end docs ("durable facts" → episode summary) and the FormatEpisodeContext comment ("recency-based ranker, no LLM" → it uses the configured, LLM-by-default ranker). Tests: path-aware taint (workspace trusted, sensitive/system tainted, malformed conservative, always-external), the ToolCallTaints table, episode promote/pending lifecycle + persistence, skills mirror (guards the llm.Message→LlmMessage Arguments plumbing), and the CLI command end to end. Co-Authored-By: Claude Opus 4.8 (1M context) --- cmd/odek/dispatch.go | 2 + cmd/odek/main.go | 5 + cmd/odek/memory_cmd.go | 66 +++++++++++++ cmd/odek/memory_cmd_test.go | 53 +++++++++++ docs/CONFIG.md | 2 +- docs/SECURITY.md | 16 +++- internal/memory/episodes.go | 53 +++++++++++ internal/memory/episodes_promote_test.go | 91 ++++++++++++++++++ internal/memory/memory.go | 29 +++++- internal/memory/provenance.go | 116 ++++++++++++++++++----- internal/memory/provenance_test.go | 114 +++++++++++++++++++++- internal/skills/provenance_test.go | 38 ++++++++ internal/skills/selfimprove.go | 23 ++--- 13 files changed, 562 insertions(+), 46 deletions(-) create mode 100644 cmd/odek/memory_cmd.go create mode 100644 cmd/odek/memory_cmd_test.go create mode 100644 internal/memory/episodes_promote_test.go diff --git a/cmd/odek/dispatch.go b/cmd/odek/dispatch.go index 95c9db2..58a6af2 100644 --- a/cmd/odek/dispatch.go +++ b/cmd/odek/dispatch.go @@ -53,6 +53,8 @@ func dispatch(args []string) int { return cliExit(telegramCmd(rest)) case "schedule": return cliExit(scheduleCmd(rest)) + case "memory": + return cliExit(memoryCmd(rest)) default: fmt.Fprintf(os.Stderr, "odek: unknown command %q\n", cmd) printUsage() diff --git a/cmd/odek/main.go b/cmd/odek/main.go index e0e5690..e7e5d0b 100644 --- a/cmd/odek/main.go +++ b/cmd/odek/main.go @@ -452,6 +452,7 @@ func printUsage() { odek mcp [--sandbox] odek telegram odek schedule + odek memory > odek version Commands: @@ -477,6 +478,10 @@ Commands: Subcommands: list, add, rm, enable, disable, run, next, daemon The daemon (or the Telegram bot) fires jobs and delivers results to stdout, a log, or a Telegram chat. + memory Review and promote past-session memory episodes + list: show episodes excluded from recall (untrusted) + promote : approve one so it can be recalled. + Human-gated on purpose — not available to the agent. init Create a config file (default: ./odek.json) version Print version and exit diff --git a/cmd/odek/memory_cmd.go b/cmd/odek/memory_cmd.go new file mode 100644 index 0000000..e011417 --- /dev/null +++ b/cmd/odek/memory_cmd.go @@ -0,0 +1,66 @@ +package main + +import ( + "fmt" + "os" + "strings" + + "github.com/BackendStack21/odek/internal/memory" +) + +// memoryCmd handles `odek memory [args]`. +// +// This is the human-gated surface for the episode-memory trust control. +// Episodes whose originating session touched external content (web/http/MCP/ +// audio, or reads of sensitive paths) are stored but excluded from recall +// until a human promotes them. Promotion lives HERE — on the CLI — and is +// deliberately NOT exposed as an agent tool, so a prompt-injected agent cannot +// approve its own poisoned memory. +func memoryCmd(args []string) error { + if len(args) == 0 { + fmt.Fprintf(os.Stderr, "Usage: odek memory [args]\n") + return nil + } + + dir := expandHome("~/.odek/memory") + store := memory.NewEpisodeStore(dir, nil) + + sub := args[0] + subArgs := args[1:] + + switch sub { + case "list", "ls", "pending": + pending, err := store.PendingReview() + if err != nil { + return err + } + if len(pending) == 0 { + fmt.Println("No episodes pending review — all stored episodes are recallable.") + return nil + } + fmt.Printf("%d episode(s) pending review (excluded from recall until promoted):\n\n", len(pending)) + for _, ep := range pending { + fmt.Printf("• %s (%d turns, %s)\n", ep.SessionID, ep.Turns, ep.CreatedAt.Format("2006-01-02 15:04")) + if len(ep.Provenance.Sources) > 0 { + fmt.Printf(" sources: %s\n", strings.Join(ep.Provenance.Sources, ", ")) + } + fmt.Printf(" %s\n\n", ep.Summary) + } + fmt.Println("Review the summary above, then promote with: odek memory promote ") + return nil + + case "promote": + if len(subArgs) == 0 { + return fmt.Errorf("usage: odek memory promote ") + } + id := subArgs[0] + if err := store.Promote(id); err != nil { + return err + } + fmt.Printf("odek: promoted episode %q — it can now be recalled into future sessions\n", id) + return nil + + default: + return fmt.Errorf("unknown memory subcommand %q (expected: list, promote)", sub) + } +} diff --git a/cmd/odek/memory_cmd_test.go b/cmd/odek/memory_cmd_test.go new file mode 100644 index 0000000..8d9b4a9 --- /dev/null +++ b/cmd/odek/memory_cmd_test.go @@ -0,0 +1,53 @@ +package main + +import ( + "path/filepath" + "testing" + + "github.com/BackendStack21/odek/internal/memory" +) + +// TestMemoryCmd_ListAndPromote exercises the human-gated promote path end to +// end through the CLI command: a seeded untrusted episode is pending, the +// command promotes it, and the approval is persisted to the on-disk index. +func TestMemoryCmd_ListAndPromote(t *testing.T) { + home := setupTestHome(t) + dir := filepath.Join(home, ".odek", "memory") + + es := memory.NewEpisodeStore(dir, nil) + if err := es.WriteWithProvenance("20260108-web", "researched a library", 5, + memory.EpisodeProvenance{Untrusted: true, Sources: []string{"browser"}}); err != nil { + t.Fatalf("seed: %v", err) + } + + if err := memoryCmd([]string{"list"}); err != nil { + t.Fatalf("memory list: %v", err) + } + if err := memoryCmd([]string{"promote", "20260108-web"}); err != nil { + t.Fatalf("memory promote: %v", err) + } + + fresh := memory.NewEpisodeStore(dir, nil) + idx, err := fresh.ReadIndex() + if err != nil { + t.Fatalf("read index: %v", err) + } + if len(idx) != 1 || !idx[0].Provenance.UserApproved { + t.Errorf("episode not approved after promote: %+v", idx) + } + + if err := memoryCmd([]string{"promote", "does-not-exist"}); err == nil { + t.Error("promoting an unknown id should error") + } + if err := memoryCmd([]string{"bogus"}); err == nil { + t.Error("unknown subcommand should error") + } +} + +// TestMemoryCmd_ListEmpty: list on a clean home must not error. +func TestMemoryCmd_ListEmpty(t *testing.T) { + setupTestHome(t) + if err := memoryCmd([]string{"list"}); err != nil { + t.Fatalf("memory list on empty home: %v", err) + } +} diff --git a/docs/CONFIG.md b/docs/CONFIG.md index ec4d8ff..97e42bf 100644 --- a/docs/CONFIG.md +++ b/docs/CONFIG.md @@ -213,7 +213,7 @@ The `memory` section controls the persistent memory system (see [docs/MEMORY.md] | `buffer_lines` | 20 | Max turn summaries in session buffer | | `buffer_enabled` | true | Enable the turn-level buffer | | `merge_on_write` | true | Use go-vector RP similarity to auto-merge related entries | -| `extract_on_end` | true | Extract durable facts via LLM at session end (≥3 turns) | +| `extract_on_end` | true | At session end (≥3 turns), extract a narrative episode summary via LLM for later recall | | `llm_search` | true | Use LLM to rank episode search results by relevance | | `llm_extract` | true | Use LLM for end-of-session fact extraction | | `llm_consolidate` | true | Use LLM to merge related fact entries | diff --git a/docs/SECURITY.md b/docs/SECURITY.md index 3eeb34c..ec258f3 100644 --- a/docs/SECURITY.md +++ b/docs/SECURITY.md @@ -100,11 +100,23 @@ Both: `internal/memory` tracks `EpisodeProvenance{Untrusted, Sources, UserApproved}` for every episode. An episode derived from a session that ingested untrusted content is **stored on disk for audit but never auto-replayed** into future sessions. This stops a single successful injection from becoming a persistent backdoor through the episode pipeline. -To use a tainted episode anyway, the user must explicitly promote it (set `UserApproved=true`). +Taint is decided per tool call by `memory.ToolCallTaints` (the single source of truth, shared with skills): + +- **Always untrusted:** `browser`, `http_batch`, `transcribe` (network/opaque-audio content), and any MCP tool (`server__tool`). +- **Path-scoped:** `read_file`, `search_files`, `multi_grep` taint **only** when their `path` argument resolves to a sensitive location (`danger.ClassifyPath` → `system_write`/`destructive`, e.g. `/etc`, `~/.ssh`, `~/.aws`). Reads confined to the workspace — or any other non-sensitive local path — stay trusted, so ordinary coding sessions remain recallable. A malformed/absent path is treated conservatively as untrusted. + +To use a tainted episode anyway, the user explicitly promotes it (sets `UserApproved=true`) from the CLI: + +``` +odek memory list # episodes excluded from recall, with their sources +odek memory promote # approve one after reviewing its summary +``` + +Promotion is **CLI-only and human-gated** — it is deliberately *not* exposed as an agent tool, so a prompt-injected agent cannot self-approve its own poisoned memory. ### 6. Skill provenance gate -`internal/skills` carries the same provenance model. Skills auto-saved from sessions that touched `browser` / `http_batch` / `read_file` / `search_files` / `multi_grep` / `transcribe` / any MCP tool are tagged with `Provenance.Untrusted=true` and `NeedsReview=true`. The skill loader pins those skills to the Lazy set regardless of their `auto_load` flag. +`internal/skills` carries the same provenance model and shares the exact taint decision (`memory.ToolCallTaints`). Skills auto-saved from sessions that crossed the trust boundary — `browser` / `http_batch` / `transcribe` / any MCP tool, or a `read_file` / `search_files` / `multi_grep` of a **sensitive** path — are tagged with `Provenance.Untrusted=true` and `NeedsReview=true`. The skill loader pins those skills to the Lazy set regardless of their `auto_load` flag. After reviewing the skill body, promote it: diff --git a/internal/memory/episodes.go b/internal/memory/episodes.go index 7834c3f..c1b2b33 100644 --- a/internal/memory/episodes.go +++ b/internal/memory/episodes.go @@ -244,6 +244,59 @@ func (e *EpisodeStore) Search(query string, limit int) ([]EpisodeMeta, error) { return ranked, nil } +// ── Promotion (human-gated escape hatch) ────────────────────────────── + +// Promote marks a tainted episode as user-approved so it can be replayed +// into future sessions. This is the human-gated escape hatch for episodes +// whose originating session legitimately touched external content. It is +// intentionally NOT exposed to the agent (only via `odek memory promote`) so +// that a prompt-injected agent cannot self-approve poisoned memory. +// +// Returns an error if the session is unknown or already approved. +func (e *EpisodeStore) Promote(sessionID string) error { + if err := session.ValidateSessionID(sessionID); err != nil { + return fmt.Errorf("memory: episodes promote: %w", err) + } + e.mu.Lock() + defer e.mu.Unlock() + + idx, err := e.ReadIndex() + if err != nil { + return err + } + found := false + for i := range idx { + if idx[i].SessionID == sessionID { + found = true + if idx[i].Provenance.UserApproved { + return fmt.Errorf("memory: episode %q is already approved", sessionID) + } + idx[i].Provenance.UserApproved = true + } + } + if !found { + return fmt.Errorf("memory: episode %q not found", sessionID) + } + return e.writeIndex(idx) +} + +// PendingReview returns the episodes that are untrusted and not yet +// user-approved — the ones currently excluded from recall that a user may +// want to promote. Ordered newest-first (as ReadIndex returns them). +func (e *EpisodeStore) PendingReview() ([]EpisodeMeta, error) { + idx, err := e.ReadIndex() + if err != nil { + return nil, err + } + var pending []EpisodeMeta + for _, ep := range idx { + if ep.Provenance.Untrusted && !ep.Provenance.UserApproved { + pending = append(pending, ep) + } + } + return pending, nil +} + // ── Index helpers ───────────────────────────────────────────────────── // addToIndex appends an entry to the index and writes it. diff --git a/internal/memory/episodes_promote_test.go b/internal/memory/episodes_promote_test.go new file mode 100644 index 0000000..688e9e6 --- /dev/null +++ b/internal/memory/episodes_promote_test.go @@ -0,0 +1,91 @@ +package memory + +import "testing" + +// TestEpisode_PromoteMakesRecallable is the escape-hatch test: a tainted +// episode is excluded from recall and listed as pending; after Promote it is +// recallable and no longer pending. +func TestEpisode_PromoteMakesRecallable(t *testing.T) { + es := NewEpisodeStore(t.TempDir(), nil) + if err := es.WriteWithProvenance("20260105-web", "researched a library", 5, + EpisodeProvenance{Untrusted: true, Sources: []string{"browser"}}); err != nil { + t.Fatalf("write: %v", err) + } + + // Pending before promote; excluded from Search. + pending, err := es.PendingReview() + if err != nil { + t.Fatalf("pending: %v", err) + } + if len(pending) != 1 || pending[0].SessionID != "20260105-web" { + t.Fatalf("PendingReview = %v, want the tainted episode", pending) + } + res, _ := es.Search("any", 10) + for _, ep := range res { + if ep.SessionID == "20260105-web" { + t.Fatal("tainted episode should be excluded from Search before promote") + } + } + + // Promote it. + if err := es.Promote("20260105-web"); err != nil { + t.Fatalf("promote: %v", err) + } + + // Now recallable and no longer pending. + pending, _ = es.PendingReview() + if len(pending) != 0 { + t.Errorf("PendingReview after promote = %v, want empty", pending) + } + res, _ = es.Search("any", 10) + var saw bool + for _, ep := range res { + if ep.SessionID == "20260105-web" { + saw = true + } + } + if !saw { + t.Errorf("promoted episode should be returned by Search, got %v", res) + } +} + +// TestEpisode_PromotePersists verifies the approval survives a fresh store +// (i.e. it was written back to the on-disk index). +func TestEpisode_PromotePersists(t *testing.T) { + dir := t.TempDir() + es := NewEpisodeStore(dir, nil) + if err := es.WriteWithProvenance("20260106-x", "s", 5, + EpisodeProvenance{Untrusted: true, Sources: []string{"http_batch"}}); err != nil { + t.Fatalf("write: %v", err) + } + if err := es.Promote("20260106-x"); err != nil { + t.Fatalf("promote: %v", err) + } + + fresh := NewEpisodeStore(dir, nil) + idx, err := fresh.ReadIndex() + if err != nil { + t.Fatalf("read index: %v", err) + } + if len(idx) != 1 || !idx[0].Provenance.UserApproved { + t.Errorf("UserApproved did not persist: %+v", idx) + } +} + +func TestEpisode_PromoteErrors(t *testing.T) { + es := NewEpisodeStore(t.TempDir(), nil) + if err := es.WriteWithProvenance("20260107-a", "s", 5, + EpisodeProvenance{Untrusted: true}); err != nil { + t.Fatalf("write: %v", err) + } + + if err := es.Promote("does-not-exist"); err == nil { + t.Error("promoting an unknown episode should error") + } + if err := es.Promote("20260107-a"); err != nil { + t.Fatalf("first promote: %v", err) + } + if err := es.Promote("20260107-a"); err == nil { + t.Error("promoting an already-approved episode should error") + } +} diff --git a/internal/memory/memory.go b/internal/memory/memory.go index 1f7ed9f..29bb764 100644 --- a/internal/memory/memory.go +++ b/internal/memory/memory.go @@ -499,10 +499,31 @@ func (m *MemoryManager) SearchEpisodes(query string, limit int) ([]EpisodeMeta, return m.episodes.Search(query, limit) } -// FormatEpisodeContext searches episodes with a recency-based ranker -// (no LLM — safe for per-turn use without recursion risk) and returns -// formatted context to inject as a system message. Returns empty string -// if no episodes found or memory is disabled. +// PromoteEpisode marks a tainted episode as user-approved so it can be +// recalled into future sessions. Human-gated escape hatch — see +// EpisodeStore.Promote. +func (m *MemoryManager) PromoteEpisode(sessionID string) error { + if m.cfg.Enabled == nil || !*m.cfg.Enabled { + return fmt.Errorf("memory: disabled") + } + return m.episodes.Promote(sessionID) +} + +// PendingReviewEpisodes lists episodes that are untrusted and not yet +// user-approved (currently excluded from recall). +func (m *MemoryManager) PendingReviewEpisodes() ([]EpisodeMeta, error) { + if m.cfg.Enabled == nil || !*m.cfg.Enabled { + return nil, fmt.Errorf("memory: disabled") + } + return m.episodes.PendingReview() +} + +// FormatEpisodeContext searches past episodes for ones relevant to the +// current task and returns formatted context to inject as a system message. +// Ranking uses the episode store's configured RankStrategy (the LLM ranker by +// default, RP similarity otherwise — see NewMemoryManager). Untrusted, +// unpromoted episodes are excluded by Search. Returns empty string if no +// episodes are found or memory is disabled. func (m *MemoryManager) FormatEpisodeContext(query string) string { if m.cfg.Enabled == nil || !*m.cfg.Enabled { return "" diff --git a/internal/memory/provenance.go b/internal/memory/provenance.go index 82caa02..d0b1129 100644 --- a/internal/memory/provenance.go +++ b/internal/memory/provenance.go @@ -1,8 +1,10 @@ package memory import ( + "encoding/json" "strings" + "github.com/BackendStack21/odek/internal/danger" "github.com/BackendStack21/odek/internal/llm" ) @@ -10,49 +12,117 @@ import ( // produced an episode. The default zero value means trusted. // // An untrusted episode is one whose originating session ingested -// content from outside the agent's trust boundary (fetched pages, files -// outside the working directory, MCP tool output, audio transcription). -// Such episodes are stored on disk for audit but are NEVER auto- -// replayed into future sessions — they must be explicitly promoted -// (UserApproved=true) by the user first. This stops a one-shot prompt -// injection from becoming a persistent backdoor. +// content from outside the agent's trust boundary (fetched pages, MCP +// tool output, audio transcription, or reads of sensitive system/ +// credential paths). Such episodes are stored on disk for audit but are +// NEVER auto-replayed into future sessions — they must be explicitly +// promoted (UserApproved=true) by the user first (see `odek memory +// promote`). This stops a one-shot prompt injection from becoming a +// persistent backdoor. type EpisodeProvenance struct { Untrusted bool `json:"untrusted,omitempty"` Sources []string `json:"sources,omitempty"` UserApproved bool `json:"user_approved,omitempty"` } -// UntrustedToolNames is the canonical set of tools whose results come from -// outside the agent's trust boundary. Any tool whose output reaches the -// model wrapped in belongs here. -// -// This is the single source of truth — skills/selfimprove.go imports it -// to derive skill provenance from the same definition. -var UntrustedToolNames = map[string]bool{ - "browser": true, - "http_batch": true, - "transcribe": true, - "read_file": true, // conservative: covers /etc/, $HOME/.ssh, etc. +// AlwaysExternalTools are tools whose RESULT content originates outside the +// agent's trust boundary regardless of their arguments: network fetches and +// opaque transcribed audio. A session that used any of these always produces +// an untrusted episode. +var AlwaysExternalTools = map[string]bool{ + "browser": true, + "http_batch": true, + "transcribe": true, +} + +// PathScopedTools are local file tools that only cross the trust boundary when +// they touch a SENSITIVE path (system or credential locations). Reads confined +// to the workspace — or any other non-sensitive local path — are trusted, so +// ordinary coding sessions no longer taint their episode just for reading +// project files. The per-call decision lives in ToolCallTaints. +var PathScopedTools = map[string]bool{ + "read_file": true, "search_files": true, "multi_grep": true, } +// UntrustedToolNames is the union of the two categories above. It is retained +// as the canonical "these tools can produce untrusted content" set for +// external references and documentation. The actual per-call decision is made +// by ToolCallTaints, which is argument-aware for the path-scoped tools. +var UntrustedToolNames = func() map[string]bool { + m := make(map[string]bool, len(AlwaysExternalTools)+len(PathScopedTools)) + for k := range AlwaysExternalTools { + m[k] = true + } + for k := range PathScopedTools { + m[k] = true + } + return m +}() + +// ToolCallTaints reports whether a single recorded tool call crossed the +// agent's trust boundary. It is the single source of truth shared by episode +// (memory) and skill (skills) provenance so the two stay in lockstep. +// +// - MCP adapter calls (name contains "__") always taint — third-party servers +// return arbitrary text. +// - AlwaysExternalTools always taint, regardless of arguments. +// - PathScopedTools taint only when their "path" argument resolves to a +// sensitive location (danger.ClassifyPath → SystemWrite/Destructive). An +// empty path means the tool defaults to the workspace (trusted); a +// malformed argument string is treated conservatively as tainting. +// - Everything else (shell, patch, write_file, …) is trusted. +func ToolCallTaints(name, argsJSON string) bool { + if strings.Contains(name, "__") { + return true + } + if AlwaysExternalTools[name] { + return true + } + if PathScopedTools[name] { + return pathArgIsSensitive(argsJSON) + } + return false +} + +// pathArgIsSensitive extracts the "path" argument of a path-scoped tool call +// and reports whether it points outside the trust boundary. All three +// path-scoped tools (read_file, search_files, multi_grep) use the "path" key. +func pathArgIsSensitive(argsJSON string) bool { + var a struct { + Path string `json:"path"` + } + if err := json.Unmarshal([]byte(argsJSON), &a); err != nil { + return true // can't determine the path → assume the worst + } + if strings.TrimSpace(a.Path) == "" { + return false // empty path defaults to the workspace — trusted + } + switch danger.ClassifyPath(a.Path) { + case danger.SystemWrite, danger.Destructive: + return true + default: + // LocalWrite (in-workspace, /tmp, and other non-sensitive local + // paths) and anything else is within the trust boundary. + return false + } +} + // DeriveProvenance walks a session's structured messages and returns // the provenance an episode derived from those messages should carry. -// A message qualifies as untrusted if it contains a tool call whose -// name is in untrustedToolNames OR follows the MCP adapter naming -// convention (server__tool). +// A message taints the episode if it contains a tool call that crossed +// the trust boundary per ToolCallTaints. func DeriveProvenance(messages []llm.Message) EpisodeProvenance { prov := EpisodeProvenance{} seen := make(map[string]bool) for _, m := range messages { for _, tc := range m.ToolCalls { - name := tc.Function.Name - tainted := UntrustedToolNames[name] || strings.Contains(name, "__") - if !tainted { + if !ToolCallTaints(tc.Function.Name, tc.Function.Arguments) { continue } prov.Untrusted = true + name := tc.Function.Name if !seen[name] { seen[name] = true prov.Sources = append(prov.Sources, name) diff --git a/internal/memory/provenance_test.go b/internal/memory/provenance_test.go index b3ec997..ce098fe 100644 --- a/internal/memory/provenance_test.go +++ b/internal/memory/provenance_test.go @@ -1,6 +1,8 @@ package memory import ( + "os" + "path/filepath" "testing" "github.com/BackendStack21/odek/internal/llm" @@ -15,6 +17,18 @@ func toolMsg(name string) llm.Message { } } +// toolMsgArgs builds an assistant message with one tool call carrying the +// given raw JSON arguments string (as recorded on real sessions). +func toolMsgArgs(name, argsJSON string) llm.Message { + tc := llm.ToolCall{} + tc.Function.Name = name + tc.Function.Arguments = argsJSON + return llm.Message{ + Role: "assistant", + ToolCalls: []llm.ToolCall{tc}, + } +} + func TestDeriveProvenance_Empty(t *testing.T) { prov := DeriveProvenance(nil) if prov.Untrusted { @@ -46,10 +60,104 @@ func TestDeriveProvenance_MCPAdapterTaints(t *testing.T) { } } -func TestDeriveProvenance_ReadFileTaints(t *testing.T) { - prov := DeriveProvenance([]llm.Message{toolMsg("read_file")}) +// ── Path-aware taint (the fix) ──────────────────────────────────────── + +// A read confined to the workspace (relative path) must NOT taint — this is +// the headline behavior change that makes episodes from normal coding +// sessions recallable again. +func TestDeriveProvenance_ReadFileWorkspaceTrusted(t *testing.T) { + for _, p := range []string{"internal/x.go", "./README.md", "cmd/odek/main.go"} { + prov := DeriveProvenance([]llm.Message{ + toolMsg("shell"), + toolMsgArgs("read_file", `{"path":"`+p+`"}`), + }) + if prov.Untrusted { + t.Errorf("read_file %q is in-workspace and must be trusted, got %+v", p, prov) + } + } +} + +// search_files / multi_grep with no path default to the workspace → trusted. +func TestDeriveProvenance_SearchDefaultPathTrusted(t *testing.T) { + msgs := []llm.Message{ + toolMsgArgs("search_files", `{"pattern":"TODO","file_glob":"*.go"}`), + toolMsgArgs("multi_grep", `{"patterns":["a","b"]}`), + } + prov := DeriveProvenance(msgs) + if prov.Untrusted { + t.Errorf("workspace-default search should be trusted, got %+v", prov) + } +} + +// A read of a sensitive system path still taints — the original concern the +// provenance control exists for. +func TestDeriveProvenance_ReadFileSensitivePathTaints(t *testing.T) { + prov := DeriveProvenance([]llm.Message{ + toolMsgArgs("read_file", `{"path":"/etc/passwd"}`), + }) if !prov.Untrusted { - t.Errorf("read_file should taint (conservative — file may be outside CWD), got %+v", prov) + t.Fatalf("/etc/passwd read must taint, got %+v", prov) + } + if len(prov.Sources) != 1 || prov.Sources[0] != "read_file" { + t.Errorf("Sources = %v, want [read_file]", prov.Sources) + } +} + +// Home credential dirs (resolved absolutely) still taint. +func TestDeriveProvenance_ReadFileHomeSecretTaints(t *testing.T) { + home, err := os.UserHomeDir() + if err != nil || home == "" { + t.Skip("no home dir") + } + secret := filepath.Join(home, ".ssh", "id_rsa") + prov := DeriveProvenance([]llm.Message{ + toolMsgArgs("read_file", `{"path":"`+secret+`"}`), + }) + if !prov.Untrusted { + t.Errorf("%s read must taint, got %+v", secret, prov) + } +} + +// Malformed / empty argument strings are treated conservatively (taint), +// since we cannot tell what path was touched. +func TestDeriveProvenance_ReadFileMalformedArgsTaints(t *testing.T) { + for _, args := range []string{"", "not json", "{"} { + prov := DeriveProvenance([]llm.Message{toolMsgArgs("read_file", args)}) + if !prov.Untrusted { + t.Errorf("malformed read_file args %q should conservatively taint, got %+v", args, prov) + } + } +} + +// Network / audio tools always taint regardless of arguments. +func TestDeriveProvenance_AlwaysExternalToolsTaint(t *testing.T) { + for _, name := range []string{"http_batch", "transcribe"} { + prov := DeriveProvenance([]llm.Message{toolMsgArgs(name, `{"path":"internal/x.go"}`)}) + if !prov.Untrusted { + t.Errorf("%s must always taint, got %+v", name, prov) + } + } +} + +// ToolCallTaints is the shared predicate used by both memory and skills. +func TestToolCallTaints(t *testing.T) { + cases := []struct { + name, args string + want bool + }{ + {"shell", `{"command":"ls"}`, false}, + {"write_file", `{"path":"/etc/x"}`, false}, // not a read tool + {"read_file", `{"path":"internal/x.go"}`, false}, + {"read_file", `{"path":"/etc/shadow"}`, true}, + {"read_file", ``, true}, + {"search_files", `{"pattern":"x"}`, false}, + {"browser", `{"url":"https://x"}`, true}, + {"github__list_issues", `{}`, true}, + } + for _, c := range cases { + if got := ToolCallTaints(c.name, c.args); got != c.want { + t.Errorf("ToolCallTaints(%q,%q) = %v, want %v", c.name, c.args, got, c.want) + } } } diff --git a/internal/skills/provenance_test.go b/internal/skills/provenance_test.go index 2fe222d..8076eba 100644 --- a/internal/skills/provenance_test.go +++ b/internal/skills/provenance_test.go @@ -13,6 +13,44 @@ func msgWithTool(name string) LlmMessage { } } +func msgWithToolArgs(name, argsJSON string) LlmMessage { + tc := LlmToolCall{} + tc.Function.Name = name + tc.Function.Arguments = argsJSON + return LlmMessage{ + Role: "assistant", + ToolCalls: []LlmToolCall{tc}, + } +} + +// Mirrors the memory fix: a skill learned from a session that only read +// workspace files must NOT be flagged for review. This also guards the +// llm.Message → LlmMessage conversion: if Arguments were dropped upstream, +// the path tool would read as empty-args and conservatively taint, failing +// this test. +func TestDeriveProvenance_WorkspaceReadTrusted(t *testing.T) { + msgs := []LlmMessage{ + msgWithTool("shell"), + msgWithToolArgs("read_file", `{"path":"internal/x.go"}`), + msgWithToolArgs("search_files", `{"pattern":"TODO"}`), + } + prov := DeriveProvenance(msgs) + if prov.Untrusted || prov.NeedsReview { + t.Errorf("workspace-only session should yield a trusted skill, got %+v", prov) + } +} + +func TestDeriveProvenance_SensitiveReadTaints(t *testing.T) { + msgs := []LlmMessage{msgWithToolArgs("read_file", `{"path":"/etc/passwd"}`)} + prov := DeriveProvenance(msgs) + if !prov.Untrusted || !prov.NeedsReview { + t.Fatalf("reading /etc/passwd should taint the skill, got %+v", prov) + } + if len(prov.Sources) != 1 || prov.Sources[0] != "read_file" { + t.Errorf("Sources = %v, want [read_file]", prov.Sources) + } +} + func TestDeriveProvenance_EmptyIsTrusted(t *testing.T) { prov := DeriveProvenance(nil) if prov.Untrusted { diff --git a/internal/skills/selfimprove.go b/internal/skills/selfimprove.go index 64c0557..f686652 100644 --- a/internal/skills/selfimprove.go +++ b/internal/skills/selfimprove.go @@ -600,32 +600,29 @@ func PassesQualityGate(s SkillSuggestion) bool { // ── Provenance derivation ───────────────────────────────────────────── -// untrustedToolNames is the canonical set defined in internal/memory. -// Referencing the same map ensures both packages stay in sync. -var untrustedToolNames = memory.UntrustedToolNames - // DeriveProvenance scans the session's tool calls and returns the trust // signals appropriate for any skill derived from it. A skill is marked // Untrusted (with NeedsReview = true) if any of the messages involved -// a tool whose output is sourced from external content the agent -// ingested. The sources list records which tools triggered the flag so -// the user can review what to inspect. +// a tool call that crossed the agent's trust boundary. The sources list +// records which tools triggered the flag so the user can review what to +// inspect. // -// We additionally treat any MCP tool call (name contains "__" — the -// adapter naming convention) as untrusted, since MCP servers return -// arbitrary text from third-party processes. +// The per-call decision is delegated to memory.ToolCallTaints — the single +// source of truth shared with episode provenance. That keeps the two systems +// in lockstep and makes both argument-aware: path-scoped reads (read_file, +// search_files, multi_grep) only taint when they touch a sensitive path, while +// network/MCP/audio tools always taint. func DeriveProvenance(messages []LlmMessage) SkillProvenance { prov := SkillProvenance{} seen := make(map[string]bool) for _, m := range messages { for _, tc := range m.ToolCalls { - name := tc.Function.Name - tainted := untrustedToolNames[name] || strings.Contains(name, "__") - if !tainted { + if !memory.ToolCallTaints(tc.Function.Name, tc.Function.Arguments) { continue } prov.Untrusted = true prov.NeedsReview = true + name := tc.Function.Name if !seen[name] { seen[name] = true prov.Sources = append(prov.Sources, name) From c550d2dd09e42a7375fc3926c8cbfd818cd6ed4c Mon Sep 17 00:00:00 2001 From: Rolando Santamaria Maso Date: Sat, 6 Jun 2026 09:57:24 +0200 Subject: [PATCH 2/3] fix(memory): close provenance bypasses found by verification (D-01/D-02/D-03/D-05) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remediates the findings from the AI Verification Protocol run on this PR. The v1 taint check used a sensitive-path *denylist* (danger.ClassifyPath) over only three tools, which left several bypasses: - D-03 (regression): macOS /etc and /var are symlinks to /private/etc, /private /var. ClassifyPath only matched /etc,/var, so read_file("/private/etc/...") classified local_write and did NOT taint — a sensitive read that tainted before this PR. Confirmed live. - D-01: sibling whole-file readers (batch_read, json_query, head_tail, sort, tr, diff, count_lines, checksum, word_count, file_info, glob, tree, base64) were in no taint set at all, so reading /etc/shadow via batch_read produced a TRUSTED, recallable episode. - D-02: home credential files outside ClassifyPath's hardcoded subdir list (e.g. ~/secrets.env) classified local_write and did not taint. - D-05: session_search re-surfaces prior-session transcripts but never tainted. Fix — flip the model from a sensitive-path denylist to a workspace-containment allowlist, and broaden coverage: - ToolCallTaints now taints a path-reading tool when ANY of its path arguments (path / path_a / path_b / files[].path) resolves OUTSIDE the workspace trust zone (cwd, the sandbox /workspace mount, or ~/.odek). Symlinks are resolved so /etc -> /private/etc can't disguise an escape. Workspace reads stay trusted (the original goal); everything else taints. Malformed args still taint conservatively; absent/empty path defaults to the workspace. - PathReadingTools expanded to every file-reading tool; session_search added to AlwaysExternalTools. Provenance no longer depends on ClassifyPath. - Defense in depth: ClassifyPath now strips the macOS /private prefix so /private/etc|var classify the same as /etc|var for its other callers (transcribe risk, write confinement). Also adds the missing manager-wrapper tests (PromoteEpisode / PendingReviewEpisodes were at 0% coverage — the report's coverage finding; now 100%) and updates docs/SECURITY.md to describe the containment model accurately (the report's doc-accuracy finding). Verified: batch_read/json_query/diff of /etc, read_file of /private/etc and ~/secrets.env, and session_search all taint; workspace reads (relative + abs) and the /workspace mount stay trusted. Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/SECURITY.md | 4 +- internal/danger/classifier.go | 9 ++ internal/memory/episodes_promote_test.go | 32 +++++ internal/memory/provenance.go | 172 +++++++++++++++++------ internal/memory/provenance_test.go | 9 ++ 5 files changed, 181 insertions(+), 45 deletions(-) diff --git a/docs/SECURITY.md b/docs/SECURITY.md index ec258f3..d34db38 100644 --- a/docs/SECURITY.md +++ b/docs/SECURITY.md @@ -102,8 +102,8 @@ Both: Taint is decided per tool call by `memory.ToolCallTaints` (the single source of truth, shared with skills): -- **Always untrusted:** `browser`, `http_batch`, `transcribe` (network/opaque-audio content), and any MCP tool (`server__tool`). -- **Path-scoped:** `read_file`, `search_files`, `multi_grep` taint **only** when their `path` argument resolves to a sensitive location (`danger.ClassifyPath` → `system_write`/`destructive`, e.g. `/etc`, `~/.ssh`, `~/.aws`). Reads confined to the workspace — or any other non-sensitive local path — stay trusted, so ordinary coding sessions remain recallable. A malformed/absent path is treated conservatively as untrusted. +- **Always untrusted:** `browser`, `http_batch`, `transcribe` (network / opaque-audio content), `session_search` (recall of prior-session transcripts, which may carry earlier-injected text), and any MCP tool (`server__tool`). +- **Path-reading tools** (`read_file`, `search_files`, `multi_grep`, `batch_read`, `json_query`, `head_tail`, `count_lines`, `checksum`, `word_count`, `sort`, `tr`, `diff`, `file_info`, `glob`, `tree`, `base64`) taint when **any** of their path arguments resolves **outside the workspace trust zone** — the workspace dir, the sandbox `/workspace` mount, or `~/.odek`. Reads confined to the workspace stay trusted, so ordinary coding sessions remain recallable; reads of anything else (system/credential paths, home files, sibling repos) taint. The check is a workspace-containment allowlist rather than a sensitive-path denylist, and it resolves symlinks (so e.g. `/etc` → `/private/etc` on macOS cannot disguise an escape). A malformed argument string is treated conservatively as untrusted. When adding a new file-reading tool, add it to `PathReadingTools`. To use a tainted episode anyway, the user explicitly promotes it (sets `UserApproved=true`) from the CLI: diff --git a/internal/danger/classifier.go b/internal/danger/classifier.go index bb32ebc..5b73e62 100644 --- a/internal/danger/classifier.go +++ b/internal/danger/classifier.go @@ -145,6 +145,15 @@ func ClassifyPath(path string) RiskClass { } abs = filepath.Clean(abs) + // macOS canonicalizes /etc, /var, and /tmp as symlinks under /private. + // Strip the /private prefix so the sensitivity checks below match + // consistently — e.g. /private/etc/master.passwd must classify the same + // as /etc/master.passwd (system_write), and /private/var/folders/... must + // still resolve to the temp dir (local_write). + if strings.HasPrefix(abs, "/private/") { + abs = strings.TrimPrefix(abs, "/private") + } + for _, prefix := range []string{"/boot", "/dev", "/proc", "/sys", "/mnt", "/media"} { if strings.HasPrefix(abs, prefix) { return Destructive diff --git a/internal/memory/episodes_promote_test.go b/internal/memory/episodes_promote_test.go index 688e9e6..d138838 100644 --- a/internal/memory/episodes_promote_test.go +++ b/internal/memory/episodes_promote_test.go @@ -72,6 +72,38 @@ func TestEpisode_PromotePersists(t *testing.T) { } } +// TestMemoryManager_PromoteAndPending covers the manager-level wrappers +// (PromoteEpisode / PendingReviewEpisodes), including the disabled-memory guard. +func TestMemoryManager_PromoteAndPending(t *testing.T) { + dir := t.TempDir() + m := NewMemoryManager(dir, nil, MemoryConfig{Enabled: boolPtr(true)}) + if err := m.episodes.WriteWithProvenance("20260201-web", "researched X", 5, + EpisodeProvenance{Untrusted: true, Sources: []string{"browser"}}); err != nil { + t.Fatalf("seed: %v", err) + } + + pending, err := m.PendingReviewEpisodes() + if err != nil || len(pending) != 1 { + t.Fatalf("PendingReviewEpisodes = %v, err=%v; want 1", pending, err) + } + if err := m.PromoteEpisode("20260201-web"); err != nil { + t.Fatalf("PromoteEpisode: %v", err) + } + pending, _ = m.PendingReviewEpisodes() + if len(pending) != 0 { + t.Errorf("after promote, pending = %v, want empty", pending) + } + + // Disabled memory: both wrappers must error rather than touch the store. + off := NewMemoryManager(dir, nil, MemoryConfig{Enabled: boolPtr(false)}) + if err := off.PromoteEpisode("20260201-web"); err == nil { + t.Error("PromoteEpisode on disabled memory should error") + } + if _, err := off.PendingReviewEpisodes(); err == nil { + t.Error("PendingReviewEpisodes on disabled memory should error") + } +} + func TestEpisode_PromoteErrors(t *testing.T) { es := NewEpisodeStore(t.TempDir(), nil) if err := es.WriteWithProvenance("20260107-a", "s", 5, diff --git a/internal/memory/provenance.go b/internal/memory/provenance.go index d0b1129..2d4c040 100644 --- a/internal/memory/provenance.go +++ b/internal/memory/provenance.go @@ -2,9 +2,10 @@ package memory import ( "encoding/json" + "os" + "path/filepath" "strings" - "github.com/BackendStack21/odek/internal/danger" "github.com/BackendStack21/odek/internal/llm" ) @@ -13,12 +14,12 @@ import ( // // An untrusted episode is one whose originating session ingested // content from outside the agent's trust boundary (fetched pages, MCP -// tool output, audio transcription, or reads of sensitive system/ -// credential paths). Such episodes are stored on disk for audit but are -// NEVER auto-replayed into future sessions — they must be explicitly -// promoted (UserApproved=true) by the user first (see `odek memory -// promote`). This stops a one-shot prompt injection from becoming a -// persistent backdoor. +// tool output, audio transcription, prior-session recall, or reads of +// files outside the workspace). Such episodes are stored on disk for +// audit but are NEVER auto-replayed into future sessions — they must be +// explicitly promoted (UserApproved=true) by the user first (see +// `odek memory promote`). This stops a one-shot prompt injection from +// becoming a persistent backdoor. type EpisodeProvenance struct { Untrusted bool `json:"untrusted,omitempty"` Sources []string `json:"sources,omitempty"` @@ -26,36 +27,55 @@ type EpisodeProvenance struct { } // AlwaysExternalTools are tools whose RESULT content originates outside the -// agent's trust boundary regardless of their arguments: network fetches and -// opaque transcribed audio. A session that used any of these always produces -// an untrusted episode. +// agent's trust boundary regardless of their arguments: network fetches, +// opaque transcribed audio, and recall of prior-session transcripts (which may +// themselves carry previously-injected content). var AlwaysExternalTools = map[string]bool{ - "browser": true, - "http_batch": true, - "transcribe": true, + "browser": true, + "http_batch": true, + "transcribe": true, + "session_search": true, } -// PathScopedTools are local file tools that only cross the trust boundary when -// they touch a SENSITIVE path (system or credential locations). Reads confined -// to the workspace — or any other non-sensitive local path — are trusted, so -// ordinary coding sessions no longer taint their episode just for reading -// project files. The per-call decision lives in ToolCallTaints. -var PathScopedTools = map[string]bool{ +// PathReadingTools are tools that read filesystem content (or structure) into +// the transcript. They taint the episode only when one of their path arguments +// resolves OUTSIDE the workspace (see pathReadEscapes) — reads confined to the +// workspace, or to odek's own ~/.odek state, stay trusted so ordinary coding +// sessions remain recallable. +// +// This must list every tool that surfaces file contents/structure to the +// model. A tool missing here would let an injected agent read a secret into a +// TRUSTED, recallable episode; when adding a new file-reading tool, add it +// here too. +var PathReadingTools = map[string]bool{ "read_file": true, "search_files": true, "multi_grep": true, + "batch_read": true, + "json_query": true, + "head_tail": true, + "count_lines": true, + "checksum": true, + "word_count": true, + "sort": true, + "tr": true, + "diff": true, + "file_info": true, + "glob": true, + "tree": true, + "base64": true, } // UntrustedToolNames is the union of the two categories above. It is retained // as the canonical "these tools can produce untrusted content" set for // external references and documentation. The actual per-call decision is made -// by ToolCallTaints, which is argument-aware for the path-scoped tools. +// by ToolCallTaints, which is argument-aware for the path-reading tools. var UntrustedToolNames = func() map[string]bool { - m := make(map[string]bool, len(AlwaysExternalTools)+len(PathScopedTools)) + m := make(map[string]bool, len(AlwaysExternalTools)+len(PathReadingTools)) for k := range AlwaysExternalTools { m[k] = true } - for k := range PathScopedTools { + for k := range PathReadingTools { m[k] = true } return m @@ -68,10 +88,11 @@ var UntrustedToolNames = func() map[string]bool { // - MCP adapter calls (name contains "__") always taint — third-party servers // return arbitrary text. // - AlwaysExternalTools always taint, regardless of arguments. -// - PathScopedTools taint only when their "path" argument resolves to a -// sensitive location (danger.ClassifyPath → SystemWrite/Destructive). An -// empty path means the tool defaults to the workspace (trusted); a -// malformed argument string is treated conservatively as tainting. +// - PathReadingTools taint only when one of their path arguments resolves +// OUTSIDE the workspace trust zone (workspace dir, the sandbox /workspace +// mount, or ~/.odek). Symlinks are resolved so e.g. /etc → /private/etc on +// macOS cannot disguise an escape. A malformed argument string taints +// conservatively; absent/empty paths default to the workspace (trusted). // - Everything else (shell, patch, write_file, …) is trusted. func ToolCallTaints(name, argsJSON string) bool { if strings.Contains(name, "__") { @@ -80,33 +101,98 @@ func ToolCallTaints(name, argsJSON string) bool { if AlwaysExternalTools[name] { return true } - if PathScopedTools[name] { - return pathArgIsSensitive(argsJSON) + if PathReadingTools[name] { + return pathReadEscapes(argsJSON) } return false } -// pathArgIsSensitive extracts the "path" argument of a path-scoped tool call -// and reports whether it points outside the trust boundary. All three -// path-scoped tools (read_file, search_files, multi_grep) use the "path" key. -func pathArgIsSensitive(argsJSON string) bool { +// pathReadEscapes extracts every filesystem path argument from a path-reading +// tool call and reports whether any of them resolves outside the workspace +// trust zone. The known path-bearing argument shapes across odek's file tools +// are: "path", "path_a"/"path_b" (diff), and a "files":[{"path":…}] array +// (batch_read, head_tail, count_lines, checksum, word_count, sort). +func pathReadEscapes(argsJSON string) bool { var a struct { - Path string `json:"path"` + Path string `json:"path"` + PathA string `json:"path_a"` + PathB string `json:"path_b"` + Files []struct { + Path string `json:"path"` + } `json:"files"` } if err := json.Unmarshal([]byte(argsJSON), &a); err != nil { - return true // can't determine the path → assume the worst + return true // can't determine the paths → assume the worst } - if strings.TrimSpace(a.Path) == "" { - return false // empty path defaults to the workspace — trusted + + roots := trustedRoots() + candidates := []string{a.Path, a.PathA, a.PathB} + for _, f := range a.Files { + candidates = append(candidates, f.Path) } - switch danger.ClassifyPath(a.Path) { - case danger.SystemWrite, danger.Destructive: - return true - default: - // LocalWrite (in-workspace, /tmp, and other non-sensitive local - // paths) and anything else is within the trust boundary. - return false + + sawPath := false + for _, p := range candidates { + if strings.TrimSpace(p) == "" { + continue + } + sawPath = true + if pathOutsideRoots(p, roots) { + return true + } + } + // No path argument at all → the tool defaulted to the workspace (trusted). + _ = sawPath + return false +} + +// trustedRoots returns the set of directory prefixes within which a file read +// is considered inside the trust boundary: the current workspace (process cwd +// at session-end), the conventional sandbox mount "/workspace", and odek's own +// ~/.odek state directory. Each is included both as-is and symlink-resolved. +func trustedRoots() []string { + var roots []string + add := func(p string) { + if p == "" { + return + } + c := filepath.Clean(p) + roots = append(roots, c) + if r, err := filepath.EvalSymlinks(c); err == nil && r != c { + roots = append(roots, r) + } + } + if cwd, err := os.Getwd(); err == nil { + add(cwd) + } + add("/workspace") // sandbox mount point (see internal/sandbox) + if home, err := os.UserHomeDir(); err == nil { + add(filepath.Join(home, ".odek")) + } + return roots +} + +// pathOutsideRoots reports whether p resolves outside every trusted root. +// The path is checked both as filepath.Abs(p) and symlink-resolved, so a +// symlinked sensitive path (e.g. /etc → /private/etc) cannot evade detection. +func pathOutsideRoots(p string, roots []string) bool { + abs, err := filepath.Abs(p) + if err != nil { + return true // unresolvable → conservative + } + abs = filepath.Clean(abs) + cands := []string{abs} + if r, err := filepath.EvalSymlinks(abs); err == nil && r != abs { + cands = append(cands, r) + } + for _, c := range cands { + for _, root := range roots { + if c == root || strings.HasPrefix(c, root+string(filepath.Separator)) { + return false // inside a trusted root + } + } } + return true } // DeriveProvenance walks a session's structured messages and returns diff --git a/internal/memory/provenance_test.go b/internal/memory/provenance_test.go index ce098fe..92bf881 100644 --- a/internal/memory/provenance_test.go +++ b/internal/memory/provenance_test.go @@ -149,8 +149,17 @@ func TestToolCallTaints(t *testing.T) { {"write_file", `{"path":"/etc/x"}`, false}, // not a read tool {"read_file", `{"path":"internal/x.go"}`, false}, {"read_file", `{"path":"/etc/shadow"}`, true}, + {"read_file", `{"path":"/private/etc/master.passwd"}`, true}, // macOS /private symlink {"read_file", ``, true}, {"search_files", `{"pattern":"x"}`, false}, + {"read_file", `{"path":"/workspace/foo.go"}`, false}, // sandbox mount is trusted + // Broadened file-reading tool coverage (was the D-01 bypass): + {"batch_read", `{"files":[{"path":"/etc/shadow"}]}`, true}, + {"batch_read", `{"files":[{"path":"internal/x.go"}]}`, false}, + {"json_query", `{"path":"/etc/passwd"}`, true}, + {"diff", `{"path_a":"/etc/hosts","path_b":"a.txt"}`, true}, // any path escaping taints + {"count_lines", `{"files":[{"path":"go.mod"}]}`, false}, + {"session_search", `{"query":"password"}`, true}, // recall of prior transcripts {"browser", `{"url":"https://x"}`, true}, {"github__list_issues", `{}`, true}, } From 2aa92bbc3304a98936b880bb50c4773a34f36bfe Mon Sep 17 00:00:00 2001 From: Rolando Santamaria Maso Date: Sat, 6 Jun 2026 10:07:14 +0200 Subject: [PATCH 3/3] feat(memory): optional auto-approval of episode learnings (default off) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add `memory.auto_approve_episodes` (default false). When enabled, untrusted episodes are stamped approved at session end so they are recalled without a manual `odek memory promote` — an explicit opt-in that trades the human review gate for convenience in trusted/sandboxed deployments. - New EpisodeProvenance.AutoApproved field, kept DISTINCT from UserApproved so the audit trail shows the approval was automatic (policy), while Untrusted + Sources are still recorded. Episode Search and PendingReview treat AutoApproved as approved. - The stamp is applied at write time in OnSessionEndWithProvenance only when the flag is on and the episode is untrusted; trusted episodes are unaffected. - Config plumbed through MemoryConfig (default false), NewMemoryManager merge, and resolveMemory. - docs/CONFIG.md and docs/SECURITY.md document it as a security trade-off that disables the persistence-injection protection for episodes when on. Tests: store-level recall of an AutoApproved episode + exclusion from pending; the secure default is false; and the full OnSessionEnd stamping path with the flag on (AutoApproved set, recallable, not pending) vs off (excluded, pending). Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/CONFIG.md | 4 +- docs/SECURITY.md | 2 + internal/config/loader.go | 35 ++++++------ internal/memory/episodes.go | 4 +- internal/memory/episodes_promote_test.go | 72 ++++++++++++++++++++++++ internal/memory/memory.go | 19 +++++++ internal/memory/provenance.go | 8 +++ 7 files changed, 125 insertions(+), 19 deletions(-) diff --git a/docs/CONFIG.md b/docs/CONFIG.md index 97e42bf..e9dcf6b 100644 --- a/docs/CONFIG.md +++ b/docs/CONFIG.md @@ -200,7 +200,8 @@ The `memory` section controls the persistent memory system (see [docs/MEMORY.md] "llm_extract": true, "llm_consolidate": true, "merge_threshold": 0.7, - "add_threshold": 0.3 + "add_threshold": 0.3, + "auto_approve_episodes": false } } ``` @@ -219,6 +220,7 @@ The `memory` section controls the persistent memory system (see [docs/MEMORY.md] | `llm_consolidate` | true | Use LLM to merge related fact entries | | `merge_threshold` | 0.7 | go-vector cosine threshold for auto-merge (0.0–1.0) | | `add_threshold` | 0.3 | go-vector cosine threshold for auto-add (0.0–1.0) | +| `auto_approve_episodes` | false | **Security trade-off.** When true, untrusted episodes (sessions that touched web/MCP/out-of-workspace content) are auto-approved at session end so they are recalled without a manual `odek memory promote`. Leaving it `false` keeps the human review gate (recommended). | ## Sub-agent configuration diff --git a/docs/SECURITY.md b/docs/SECURITY.md index d34db38..e25e4c5 100644 --- a/docs/SECURITY.md +++ b/docs/SECURITY.md @@ -114,6 +114,8 @@ odek memory promote # approve one after reviewing its summary Promotion is **CLI-only and human-gated** — it is deliberately *not* exposed as an agent tool, so a prompt-injected agent cannot self-approve its own poisoned memory. +**Opt-out of the gate (`memory.auto_approve_episodes`, default `false`).** Operators who accept the risk (e.g. a fully sandboxed, single-tenant deployment) can set `auto_approve_episodes: true` to have untrusted episodes stamped `AutoApproved` at session end so they are recalled without a manual promote. This **disables the persistence-injection protection** for episodes — a single successful injection can then influence future sessions automatically — so it is off by default and should stay off in any environment exposed to untrusted input. The on-disk record still keeps `Untrusted=true` and `Sources`, and uses a distinct `AutoApproved` flag (never `UserApproved`) so the audit trail shows the approval was automatic. + ### 6. Skill provenance gate `internal/skills` carries the same provenance model and shares the exact taint decision (`memory.ToolCallTaints`). Skills auto-saved from sessions that crossed the trust boundary — `browser` / `http_batch` / `transcribe` / any MCP tool, or a `read_file` / `search_files` / `multi_grep` of a **sensitive** path — are tagged with `Provenance.Untrusted=true` and `NeedsReview=true`. The skill loader pins those skills to the Lazy set regardless of their `auto_load` flag. diff --git a/internal/config/loader.go b/internal/config/loader.go index 85b8068..e63742b 100644 --- a/internal/config/loader.go +++ b/internal/config/loader.go @@ -649,22 +649,22 @@ func LoadConfig(cli CLIFlags) ResolvedConfig { MaxIter: cfg.MaxIter, System: cfg.System, - SandboxImage: cfg.SandboxImage, // empty = resolve at call site (Dockerfile.odek or alpine:latest) - SandboxNetwork: ifZero(cfg.SandboxNetwork, DefaultSandboxNetwork), - SandboxMemory: cfg.SandboxMemory, - SandboxCPUs: cfg.SandboxCPUs, - SandboxUser: cfg.SandboxUser, - SandboxEnv: cfg.SandboxEnv, - SandboxVolumes: cfg.SandboxVolumes, - Skills: resolveSkills(cfg.Skills), - Dangerous: resolveDangerous(cfg.Dangerous), - Memory: resolveMemory(cfg.Memory), - MCPServers: cfg.MCPServers, - Telegram: resolveTelegram(cfg.Telegram), - Transcription: resolveTranscription(cfg.Transcription), - Schedules: resolveSchedules(cfg.Schedules), - InteractionMode: ifZero(cfg.InteractionMode, "engaging"), - ToolProgress: ifZero(cfg.ToolProgress, "all"), + SandboxImage: cfg.SandboxImage, // empty = resolve at call site (Dockerfile.odek or alpine:latest) + SandboxNetwork: ifZero(cfg.SandboxNetwork, DefaultSandboxNetwork), + SandboxMemory: cfg.SandboxMemory, + SandboxCPUs: cfg.SandboxCPUs, + SandboxUser: cfg.SandboxUser, + SandboxEnv: cfg.SandboxEnv, + SandboxVolumes: cfg.SandboxVolumes, + Skills: resolveSkills(cfg.Skills), + Dangerous: resolveDangerous(cfg.Dangerous), + Memory: resolveMemory(cfg.Memory), + MCPServers: cfg.MCPServers, + Telegram: resolveTelegram(cfg.Telegram), + Transcription: resolveTranscription(cfg.Transcription), + Schedules: resolveSchedules(cfg.Schedules), + InteractionMode: ifZero(cfg.InteractionMode, "engaging"), + ToolProgress: ifZero(cfg.ToolProgress, "all"), } // MaxConcurrency: default to 3 if not set @@ -850,6 +850,9 @@ func resolveMemory(cfg *memory.MemoryConfig) memory.MemoryConfig { if cfg.MinTurnsForExtraction > 0 { def.MinTurnsForExtraction = cfg.MinTurnsForExtraction } + if cfg.AutoApproveEpisodes != nil { + def.AutoApproveEpisodes = cfg.AutoApproveEpisodes + } return def } diff --git a/internal/memory/episodes.go b/internal/memory/episodes.go index c1b2b33..5fe0837 100644 --- a/internal/memory/episodes.go +++ b/internal/memory/episodes.go @@ -217,7 +217,7 @@ func (e *EpisodeStore) Search(query string, limit int) ([]EpisodeMeta, error) { // EpisodeProvenance exists to close. filtered := idx[:0:len(idx)] for _, ep := range idx { - if ep.Provenance.Untrusted && !ep.Provenance.UserApproved { + if ep.Provenance.Untrusted && !ep.Provenance.UserApproved && !ep.Provenance.AutoApproved { continue } filtered = append(filtered, ep) @@ -290,7 +290,7 @@ func (e *EpisodeStore) PendingReview() ([]EpisodeMeta, error) { } var pending []EpisodeMeta for _, ep := range idx { - if ep.Provenance.Untrusted && !ep.Provenance.UserApproved { + if ep.Provenance.Untrusted && !ep.Provenance.UserApproved && !ep.Provenance.AutoApproved { pending = append(pending, ep) } } diff --git a/internal/memory/episodes_promote_test.go b/internal/memory/episodes_promote_test.go index d138838..406ba9b 100644 --- a/internal/memory/episodes_promote_test.go +++ b/internal/memory/episodes_promote_test.go @@ -2,6 +2,78 @@ package memory import "testing" +// ── Auto-approve episode learnings (opt-in, off by default) ─────────── + +// TestEpisode_AutoApprovedIsRecallable: an episode stamped AutoApproved is +// recalled by Search and is not listed as pending — same effect as a human +// promote, but recorded as automatic. +func TestEpisode_AutoApprovedIsRecallable(t *testing.T) { + es := NewEpisodeStore(t.TempDir(), nil) + if err := es.WriteWithProvenance("20260301-auto", "auto-approved external research", 5, + EpisodeProvenance{Untrusted: true, AutoApproved: true, Sources: []string{"browser"}}); err != nil { + t.Fatalf("write: %v", err) + } + res, err := es.Search("any", 10) + if err != nil { + t.Fatalf("search: %v", err) + } + if len(res) != 1 || res[0].SessionID != "20260301-auto" { + t.Errorf("auto-approved episode should be recalled, got %v", res) + } + pending, _ := es.PendingReview() + if len(pending) != 0 { + t.Errorf("auto-approved episode should not be pending, got %v", pending) + } +} + +// TestDefaultMemoryConfig_AutoApproveOff: the secure default is false. +func TestDefaultMemoryConfig_AutoApproveOff(t *testing.T) { + d := DefaultMemoryConfig() + if d.AutoApproveEpisodes == nil || *d.AutoApproveEpisodes { + t.Errorf("AutoApproveEpisodes default should be false, got %v", d.AutoApproveEpisodes) + } +} + +// TestOnSessionEnd_AutoApproveStamping: with the flag on, an untrusted episode +// extracted at session end is stamped AutoApproved (not UserApproved) and is +// recallable; with the flag off (default) it stays pending/excluded. +func TestOnSessionEnd_AutoApproveStamping(t *testing.T) { + llm := &mockLLM{responses: map[string]string{"Summarize": "researched a library online"}} + msgs := []string{"user: hi", "assistant: ok", "user: go", "assistant: done"} + prov := EpisodeProvenance{Untrusted: true, Sources: []string{"browser"}} + + // Flag ON → auto-approved + recallable. + on := DefaultMemoryConfig() + on.AutoApproveEpisodes = boolPtr(true) + mOn := NewMemoryManager(t.TempDir(), llm, on) + mOn.OnSessionEndWithProvenance("20260303-on", 5, msgs, prov) + + idx, err := mOn.episodes.ReadIndex() + if err != nil || len(idx) != 1 { + t.Fatalf("expected 1 episode, got %v err=%v", idx, err) + } + p := idx[0].Provenance + if !p.Untrusted || !p.AutoApproved || p.UserApproved { + t.Errorf("flag-on episode provenance = %+v; want Untrusted+AutoApproved, not UserApproved", p) + } + if res, _ := mOn.SearchEpisodes("any", 10); len(res) != 1 { + t.Errorf("flag-on episode should be recallable, got %v", res) + } + if pend, _ := mOn.PendingReviewEpisodes(); len(pend) != 0 { + t.Errorf("flag-on episode should not be pending, got %v", pend) + } + + // Flag OFF (default) → stays untrusted, excluded, pending. + mOff := NewMemoryManager(t.TempDir(), llm, DefaultMemoryConfig()) + mOff.OnSessionEndWithProvenance("20260304-off", 5, msgs, prov) + if res, _ := mOff.SearchEpisodes("any", 10); len(res) != 0 { + t.Errorf("flag-off untrusted episode must be excluded from recall, got %v", res) + } + if pend, _ := mOff.PendingReviewEpisodes(); len(pend) != 1 { + t.Errorf("flag-off untrusted episode should be pending, got %v", pend) + } +} + // TestEpisode_PromoteMakesRecallable is the escape-hatch test: a tainted // episode is excluded from recall and listed as pending; after Promote it is // recallable and no longer pending. diff --git a/internal/memory/memory.go b/internal/memory/memory.go index 29bb764..01cf326 100644 --- a/internal/memory/memory.go +++ b/internal/memory/memory.go @@ -37,6 +37,13 @@ type MemoryConfig struct { MergeThreshold float32 `json:"merge_threshold,omitempty"` AddThreshold float32 `json:"add_threshold,omitempty"` MinTurnsForExtraction int `json:"min_turns_for_extraction,omitempty"` + + // AutoApproveEpisodes, when true, stamps untrusted episodes as approved at + // session-end so they are recalled without a manual `odek memory promote`. + // SECURITY: this is the opt-in escape valve that trades the human review + // gate for convenience — a session that ingested external/untrusted content + // can then influence future sessions automatically. Off (false) by default. + AutoApproveEpisodes *bool `json:"auto_approve_episodes,omitempty"` } // BoolPtr returns a pointer to a bool value. @@ -60,6 +67,7 @@ func DefaultMemoryConfig() MemoryConfig { MergeThreshold: MergeThreshold, AddThreshold: AddThreshold, MinTurnsForExtraction: defaultMinTurnsForExtraction, + AutoApproveEpisodes: boolPtr(false), // secure default — human gate stays on } } @@ -134,6 +142,9 @@ func NewMemoryManager(memoryDir string, llc LLMClient, cfg MemoryConfig) *Memory if cfg.MinTurnsForExtraction > 0 { def.MinTurnsForExtraction = cfg.MinTurnsForExtraction } + if cfg.AutoApproveEpisodes != nil { + def.AutoApproveEpisodes = cfg.AutoApproveEpisodes + } cfg = def factsDir := memoryDir @@ -490,6 +501,14 @@ func (m *MemoryManager) OnSessionEndWithProvenance(sessionID string, turns int, return } + // Opt-in auto-approval: stamp untrusted episodes as approved so they are + // recalled without a manual `odek memory promote`. Off by default; the + // audit record keeps Untrusted + Sources so it stays clear the content was + // external and the approval was automatic (AutoApproved, not UserApproved). + if prov.Untrusted && m.cfg.AutoApproveEpisodes != nil && *m.cfg.AutoApproveEpisodes { + prov.AutoApproved = true + } + // Write as episode, preserving the provenance signal. m.episodes.WriteIfEnoughWithProvenance(sessionID, extraction, turns, prov) } diff --git a/internal/memory/provenance.go b/internal/memory/provenance.go index 2d4c040..de0df1a 100644 --- a/internal/memory/provenance.go +++ b/internal/memory/provenance.go @@ -20,10 +20,18 @@ import ( // explicitly promoted (UserApproved=true) by the user first (see // `odek memory promote`). This stops a one-shot prompt injection from // becoming a persistent backdoor. +// +// AutoApproved is the opt-in escape valve: when the operator sets +// memory.auto_approve_episodes=true, untrusted episodes are stamped +// AutoApproved at creation so they are recalled without a manual promote. +// It is kept distinct from UserApproved so the audit trail still shows the +// approval was automatic (policy) rather than a human decision; Untrusted and +// Sources remain recorded either way. type EpisodeProvenance struct { Untrusted bool `json:"untrusted,omitempty"` Sources []string `json:"sources,omitempty"` UserApproved bool `json:"user_approved,omitempty"` + AutoApproved bool `json:"auto_approved,omitempty"` } // AlwaysExternalTools are tools whose RESULT content originates outside the