mattmezza · mattmezza · Jun 28, 2026 · Jun 28, 2026 · Jun 28, 2026 · Jun 28, 2026
diff --git a/api/admin.py b/api/admin.py
@@ -1300,6 +1300,7 @@ async def _bool(key: str, default: str) -> str:
             "emb_model": await _cfg("memory.embedding.model", "BAAI/bge-small-en-v1.5"),
             "emb_base_url": await _cfg("memory.embedding.base_url", ""),
             "emb_top_k": await _cfg("memory.embedding.injection_top_k", "12"),
+            "emb_recall_top_k": await _cfg("memory.embedding.recall_top_k", "10"),
             "hygiene_enabled": await _bool("memory.hygiene_enabled", "true"),
             "default_importance": await _cfg("memory.default_importance", "5.0"),
             "archive_after_days": await _cfg("memory.archive_after_days", "90"),
@@ -1656,6 +1657,7 @@ async def patch_config(body: ConfigPatchIn) -> dict:
                 # Tier 3/4 lifecycle knobs so memory config changes apply live.
                 agent.memory.embedder = agent._build_embedder()
                 agent.memory.injection_top_k = mem_cfg.embedding.injection_top_k
+                agent.memory.recall_top_k = mem_cfg.embedding.recall_top_k
                 agent.memory.default_importance = mem_cfg.default_importance
                 agent.memory.archive_after_days = mem_cfg.archive_after_days
                 agent.memory.archive_max_importance = mem_cfg.archive_max_importance
@@ -3322,8 +3324,10 @@ def _config_requires_restart(values: dict) -> bool:
 
 # Function-tools that a persona may scope. ``load_skill`` is intentionally
 # excluded — it is always available (the core mechanic personae use to read
-# their allowlisted skills). Kept here (not imported from core.agent) to avoid
-# pulling the agent's heavy import graph into the admin app.
+# their allowlisted skills); so are the vault tools and ``recall_memory``
+# (memory is injected for every persona, scope-filtered, so its on-demand
+# counterpart is always available too). Kept here (not imported from core.agent)
+# to avoid pulling the agent's heavy import graph into the admin app.
 GATEABLE_TOOLS = [
     "run_command",
     "send_email",

diff --git a/api/templates/partials/memory.html b/api/templates/partials/memory.html
@@ -9,6 +9,7 @@
     embModel: {{ emb_model|default('BAAI/bge-small-en-v1.5', true)|tojson|forceescape }},
     embBaseUrl: {{ emb_base_url|default('', true)|tojson|forceescape }},
     embTopK: {{ emb_top_k|default('12', true)|tojson|forceescape }},
+    embRecallTopK: {{ emb_recall_top_k|default('10', true)|tojson|forceescape }},
     embResult: '', embOk: false,
     embStatus: null, embBusy: false, embTestResult: '',
     // Lifecycle (Tier 3/4)
@@ -151,6 +152,12 @@ <h2 class="text-base mb-1">Semantic memory (embeddings)</h2>
         <p class="text-muted text-xs mt-1">How many of the most relevant long-term memories to put in the prompt each message.</p>
       </div>
 
+      <div>
+        <label class="label">Recall results (recall_memory tool)</label>
+        <input type="number" class="input-sm" style="max-width:120px" x-model="embRecallTopK" :disabled="!embEnabled" min="1" max="25">
+        <p class="text-muted text-xs mt-1">Default number of memories the recall_memory tool returns when the agent searches its full long-term store on demand (the agent may request fewer/more per call, capped at 25).</p>
+      </div>
+
       <div class="text-xs" x-show="isLocal && embStatus">
         Model on disk:
         <span x-show="embStatus && embStatus.model_ready" class="text-success">yes ✓</span>
@@ -165,7 +172,8 @@ <h2 class="text-base mb-1">Semantic memory (embeddings)</h2>
                 'memory.embedding.provider': embProvider,
                 'memory.embedding.model': embModel,
                 'memory.embedding.base_url': embBaseUrl,
-                'memory.embedding.injection_top_k': String(embTopK)
+                'memory.embedding.injection_top_k': String(embTopK),
+                'memory.embedding.recall_top_k': String(embRecallTopK)
               }, 'Embedding settings saved', 'embOk', 'embResult').then(loadEmbStatus)">
         Save embedding settings
       </button>

diff --git a/core/agent.py b/core/agent.py
@@ -186,6 +186,31 @@ def _shell_quote(s: str) -> str:
             "required": ["name"],
         },
     },
+    {
+        "name": "recall_memory",
+        "description": (
+            "Search your FULL long-term memory by meaning for facts about the user that "
+            "aren't already shown to you. Only the few most-relevant memories are injected "
+            "into each turn; call this when you suspect a relevant stored fact exists beyond "
+            "them — it searches the whole store, including older archived memories, and ranks "
+            "matches by relevance. Pass a natural-language query describing the fact you're "
+            "after (e.g. 'dietary restrictions and food allergies'), not just keywords."
+        ),
+        "input_schema": {
+            "type": "object",
+            "properties": {
+                "query": {
+                    "type": "string",
+                    "description": "Natural-language description of the fact(s) to recall",
+                },
+                "limit": {
+                    "type": "integer",
+                    "description": "Max memories to return (default 10).",
+                },
+            },
+            "required": ["query"],
+        },
+    },
     {
         "name": "manage_jobs",
         "description": (
@@ -374,7 +399,9 @@ def scoped_tools(persona: Persona | None) -> list[dict]:
         return TOOLS
     # ``load_skill`` and the vault discovery/request tools are always retained:
     # they are the mechanics personae rely on to read skills and obtain secrets.
-    _always = {"load_skill", "list_secrets", "request_secret"}
+    # ``recall_memory`` too — memory is injected for every persona (scope-filtered),
+    # so its on-demand counterpart exposes nothing extra and stays available (#47).
+    _always = {"load_skill", "recall_memory", "list_secrets", "request_secret"}
     return [t for t in TOOLS if persona.allows_tool(t["name"]) or t["name"] in _always]
 
 
@@ -419,6 +446,7 @@ def __init__(self, config: Config, secret_store: SecretStore | None = None):
             long_term_limit=mem_cfg.long_term_limit,
             embedder=self._build_embedder(),
             injection_top_k=mem_cfg.embedding.injection_top_k,
+            recall_top_k=mem_cfg.embedding.recall_top_k,
             default_importance=mem_cfg.default_importance,
             archive_after_days=mem_cfg.archive_after_days,
             archive_max_importance=mem_cfg.archive_max_importance,
@@ -1206,6 +1234,9 @@ async def _execute_tool(
                 return {"error": f"Skill not found: {skill_name}"}
             return {"name": skill_name, "content": content}
 
+        if name == "recall_memory":
+            return await self._tool_recall_memory(params, request_state)
+
         if name == "manage_jobs":
             log.info("Tool call: manage_jobs — %s", params.get("action", ""))
             result = await self._tool_manage_jobs(params)
@@ -1602,6 +1633,28 @@ async def _tool_manage_jobs(self, params: dict) -> dict:
 
         return {"error": f"Unknown action: {action!r}. Use 'create', 'list', or 'cancel'."}
 
+    async def _tool_recall_memory(self, params: dict, request_state: dict | None = None) -> dict:
+        """Deliberate semantic search over the full long-term memory store (#47).
+
+        Scoped to the active persona (#42): ``persona_name`` from the per-turn request
+        state is the recall scope (``""`` = the default identity's shared-only view), so
+        one persona never recalls another's private memories. A non-int (or bool)
+        ``limit`` is ignored. Returns query/count/memories, or ``{"error": ...}``.
+        """
+        query = str(params.get("query", "")).strip()
+        if not query:
+            return {"error": "Missing 'query'."}
+        limit = params.get("limit")
+        limit = limit if isinstance(limit, int) and not isinstance(limit, bool) else None
+        scope = (request_state or {}).get("persona_name") or ""
+        try:
+            memories = await self.memory.recall(query, limit, scope=scope)
+        except Exception:
+            log.exception("recall_memory failed for query: %s", query)
+            return {"error": "Memory recall failed."}
+        log.info("Tool call: recall_memory — %r (%d hits)", query, len(memories))
+        return {"query": query, "count": len(memories), "memories": memories}
+
     async def _tool_web_search(self, params: dict) -> dict:
         """Search the web via Tavily API."""
         if not self.search_client:

diff --git a/core/config.py b/core/config.py
@@ -181,6 +181,7 @@ class EmbeddingConfig(BaseModel):
     base_url: str = ""  # API providers only; falls back to the agent provider base URL when empty
     dimensions: int = 0  # 0 = provider default (API providers only)
     injection_top_k: int = 12  # relevance-ranked memories injected per turn
+    recall_top_k: int = 10  # max memories returned by the recall_memory tool (full-store lookup)
 
 
 class MemoryConfig(BaseModel):

diff --git a/core/memory.py b/core/memory.py
@@ -492,6 +492,7 @@ def __init__(
         *,
         embedder: EmbeddingClient | None = None,
         injection_top_k: int = 12,
+        recall_top_k: int = 10,
         default_importance: float = 5.0,
         archive_after_days: int = 90,
         archive_max_importance: float = 4.0,
@@ -503,6 +504,7 @@ def __init__(
         self.long_term_limit = long_term_limit
         self.embedder = embedder
         self.injection_top_k = injection_top_k
+        self.recall_top_k = recall_top_k
         self.default_importance = default_importance
         self.archive_after_days = archive_after_days
         self.archive_max_importance = archive_max_importance
@@ -628,15 +630,90 @@ async def get_relevant_long_term(self, query: str, scope: str | None = None) ->
             for r in top
         ]
 
-    async def _reinforce(self, ids: list[int]) -> None:
-        """Strengthen recalled memories: bump access_count and last_accessed."""
+    # Upper bound on how many memories one recall_memory call may return, and the
+    # minimum relevance a row needs to be worth returning at all.
+    _RECALL_MAX_LIMIT = 25
+    # ponytail: relevance floor — raise to cut noise, lower to surface more long tail.
+    _RECALL_MIN_RELEVANCE = 0.1
+
+    async def recall(
+        self, query: str, limit: int | None = None, scope: str | None = None
+    ) -> list[dict]:
+        """Deliberate semantic search over the FULL long-term store (issue #47).
+
+        This is the agent's on-demand recall tool — the complement to the
+        always-injected top-k (:meth:`get_relevant_long_term`). Where injection
+        ranks only *non-archived* rows by a recency+importance+relevance blend
+        and is capped to a small per-turn budget, recall searches *every*
+        long-term memory (archived included), ranks purely by semantic relevance
+        to *query*, and returns the best matches above a small relevance floor.
+
+        Recalling reinforces the returned rows and un-archives any that had been
+        archived (the agent looked them up and they matched, so they are warm
+        again). Falls back to lexical token overlap when embeddings are
+        unavailable or the query can't be embedded, so recall always works.
+
+        ``limit`` defaults to ``recall_top_k`` and is clamped to 1.._RECALL_MAX_LIMIT.
+        ``scope`` filters per #42 (see :func:`_scope_filter`) like the injection readers:
+        a persona recalls shared + its own private memories only; ``None`` = every scope.
+        """
+        query = (query or "").strip()
+        if not query:
+            return []
+        wanted = limit if limit and limit >= 1 else self.recall_top_k
+        limit = max(1, min(wanted, self._RECALL_MAX_LIMIT))  # bad config can't go <1
+
+        await self._ensure_schema()
+        clause, params = _scope_filter(scope)
+        async with aiosqlite.connect(self.db_path) as db:
+            db.row_factory = aiosqlite.Row
+            cursor = await db.execute(
+                "SELECT id, category, subject, content, embedding, archived "  # noqa: S608
+                f"FROM long_term WHERE 1=1{clause}",
+                params,
+            )
+            rows = [dict(r) for r in await cursor.fetchall()]
+        if not rows:
+            return []
+
+        # Embedding cosine for rows with a matching-dim vector; lexical for the rest.
+        query_vec = await self._safe_embed(query)
+        rel_map = _batch_relevance(query_vec, rows)
+        query_tokens = _tokens(query)
+
+        scored: list[tuple[float, dict]] = []
+        for i, row in enumerate(rows):
+            if i in rel_map:
+                relevance = rel_map[i]
+            else:
+                relevance = _similarity(query_tokens, _tokens(f"{row['subject']} {row['content']}"))
+            if relevance >= self._RECALL_MIN_RELEVANCE:
+                scored.append((relevance, row))
+
+        scored.sort(key=lambda pair: pair[0], reverse=True)
+        top = [row for _, row in scored[:limit]]
+        # Recall revives matches: reinforce + un-archive. The returned rows are
+        # therefore all non-archived now, so no archived flag is surfaced.
+        await self._reinforce([row["id"] for row in top], unarchive=True)
+        return [
+            {"category": r["category"], "subject": r["subject"], "content": r["content"]}
+            for r in top
+        ]
+
+    async def _reinforce(self, ids: list[int], *, unarchive: bool = False) -> None:
+        """Strengthen recalled memories: bump access_count and last_accessed.
+
+        No-op for empty *ids*. With *unarchive*, also clear the archived flag — a memory
+        the agent deliberately recalled is demonstrably warm again (issue #47).
+        """
         if not ids:
             return
         await self._ensure_schema()
+        archived_clause = ", archived = 0" if unarchive else ""  # fixed SQL text, no user data
         async with aiosqlite.connect(self.db_path) as db:
             await db.executemany(
-                "UPDATE long_term SET access_count = access_count + 1, "
-                "last_accessed = datetime('now') WHERE id = ?",
+                "UPDATE long_term SET access_count = access_count + 1, "  # noqa: S608
+                f"last_accessed = datetime('now'){archived_clause} WHERE id = ?",
                 [(i,) for i in ids],
             )
             await db.commit()

diff --git a/core/permissions.py b/core/permissions.py
@@ -117,6 +117,7 @@ class PermissionLevel:
     "run_command:git*push*": "ASK",
     "run_command:git*commit*": "ASK",
     "web_search": "ALWAYS",
+    "recall_memory": "ALWAYS",
     # Write operations — ask first
     "send_email": "ASK",
     "reply_email": "ASK",

diff --git a/core/prompt_builder.py b/core/prompt_builder.py
@@ -181,7 +181,9 @@ def build_prompt_sections(
     memory_instruction = (
         "You can store and recall memories using the sqlite3 CLI (see the memory skill).\n"
         "Proactively remember important facts about the user and their contacts.\n"
-        "Before inserting a new long-term memory, check if it already exists to avoid duplicates."
+        "Before inserting a new long-term memory, check if it already exists to avoid duplicates.\n"
+        "Only your most relevant memories are shown each turn; when you suspect a stored fact "
+        "isn't among them, call the recall_memory tool to search your full memory by meaning."
     )
 
     history_handling = ""

diff --git a/docs/content/docs/memory.mdx b/docs/content/docs/memory.mdx
@@ -128,9 +128,21 @@ Set `memory.embedding.enabled: false` to fall back to **Tier-1 lexical** (word-o
 ### What embeddings power
 
 - **Relevance-ranked injection.** Instead of dumping the most recent `long_term_limit` rows into the prompt, only the `injection_top_k` (default 12) memories most relevant to the *current message* are injected. They're scored Generative-Agents style: `relevance + 0.5·importance + 0.3·recency`. Injection happens in the **per-turn preamble** (prepended to the current user message), not the static system prompt — so even in session mode (where the static prompt is snapshotted once) the current message is the query every turn, and a fact written mid-session is visible on the next turn without `/new`.
+- **On-demand recall** (the `recall_memory` tool, below).
 - **Dedup.** `update_memory` retrieves ADD/UPDATE/DELETE/NOOP candidates by cosine similarity (with a per-row lexical fallback for any memory that has no vector yet).
 - **Hygiene clustering** (Tier 4, below).
 
+### `recall_memory` — deliberate full-store lookup
+
+Relevance-ranked injection is a small always-on top-k: great for the obvious facts, but as the store grows it can't surface everything, and the agent *can't ask for a fact it doesn't suspect it has*. So injection is paired with a **`recall_memory` tool** the agent calls deliberately when it suspects a relevant stored fact isn't in the injected set (hybrid retrieval — always-on top-k **plus** on-demand recall).
+
+- Searches the agent's **entire** long-term store — **including archived** memories that injection never sees — ranked purely by semantic relevance to the agent's query (lexical-overlap fallback when embeddings are off).
+- Returns up to `recall_top_k` (default 10) matches above a small relevance floor; the agent may pass a per-call `limit` to ask for fewer or more, capped at 25.
+- Recalling **revives** the matched rows: they're reinforced (`access_count` / `last_accessed`) and any archived one is un-archived — a fact the agent looked up and used is warm again.
+- **Scope-aware** (#42): recall is filtered to `scope IN ('', <active persona>)` exactly like injection, so a persona only ever recalls shared facts plus its own private ones — never another persona's. The default identity recalls shared only.
+
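+The tool needs no approval prompt — its only writes are the reinforcement / un-archive bookkeeping on the rows it returns — and it is available to every persona (it is confined to the same scopes injection already uses for that persona, though within them it can also reach archived rows), alongside the sqlite CLI the memory skill teaches for exact-field queries and bulk edits.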
+
 All of this is configurable live from the **admin Memory tab** (enable toggle, backend, model, top-k, plus Download / Test buttons).
 
 ## Tier 3 — Forgetting, importance & reinforcement
@@ -230,6 +242,7 @@ memory:
     base_url: ""                       # API providers only; falls back to agent provider base URL
     dimensions: 0                      # 0 = provider default (API providers only)
     injection_top_k: 12                # relevance-ranked memories injected per turn
+    recall_top_k: 10                   # default result count for the recall_memory tool (per-call limit capped at 25)
 
   # Tier 3 — forgetting / importance / reinforcement
   default_importance: 5.0              # 1-10 scale assigned to new long-term memories

diff --git a/skills/memory.md b/skills/memory.md
@@ -37,6 +37,22 @@ sqlite3 data/memory.db "UPDATE long_term SET content = 'New value', updated_at =
 
 ## Querying memories
 
+### Semantic recall (full store, by meaning)
+
+The sqlite queries below match on exact fields or `LIKE` substrings. To find a
+fact by **meaning** across your *entire* long-term store — including older,
+archived memories that aren't injected into the prompt — call the
+`recall_memory` tool instead of writing SQL:
+
+- Each turn only injects your few most-relevant memories. When you suspect a
+  stored fact exists beyond them (e.g. "didn't they mention a food allergy?"),
+  call `recall_memory` with a natural-language `query` describing the fact.
+- It ranks the whole store by semantic similarity and returns the best matches.
+  Recalling a memory also revives it (un-archives + reinforces it).
+
+Use `recall_memory` for fuzzy "do I know anything about X?" lookups; use the
+sqlite queries below when you need exact filtering, counts, or bulk edits.
+
 ### Search by subject
 
 ```bash