Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions engram-test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from engram import EngramClient
import os
import uuid

Check failure on line 3 in engram-test.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (F401)

engram-test.py:3:8: F401 `uuid` imported but unused

client = EngramClient(
api_key=os.getenv("ENGRAM_API_KEY"),
base_url="https://dev-engram.labs.weaviate.io"
)

user_id = f"connor-1234567890"

Check failure on line 10 in engram-test.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (F541)

engram-test.py:10:11: F541 f-string without any placeholders

'''
run = client.memories.add(
"My favorite mattress brand is Purple",
user_id=user_id,
)

print(f"Run ID: {run.run_id}")
print(f"Status: {run.status}")
'''

results = client.memories.search(
query="What is the user's favorite mattress brand?",
user_id=user_id,
)

for result in results:
print(result)

"""
# Returns:
Memory(id='08505722-6e44-43b5-bb17-2091a58fa834', project_id='019ce79c-d6d6-7dea-8687-edaacade55ee', content="The user's favorite mattress brand is Purple.", topic='UserKnowledge', group='default', created_at='2026-03-19T13:15:46.668Z', updated_at='2026-03-19T13:15:46.668Z', user_id='connor-1234567890', conversation_id=None, tags=None, score=1)
"""
5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ name = "query-agent-benchmarking"
version = "0.6"
description="A Python library for benchmarking Weaviate's Query Agent!"
readme="README.md"
requires-python = ">=3.10"
requires-python = ">=3.11"
dependencies = [
"dspy>=3.0.4",
"sentence-transformers>=5.0.0",
Expand All @@ -20,7 +20,8 @@ dependencies = [
"setuptools>=80.9.0",
"wheel>=0.45.1",
"twine>=6.2.0",
"PyMuPDF>=1.24.0",
"PyMuPDF>=1.24.0,!=1.27.2.2",
"weaviate-engram>=0.4.0",
]

[tool.setuptools.packages.find]
Expand Down
4 changes: 3 additions & 1 deletion query_agent_benchmarking/agent/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
from .search_agent import SearchAgentBuilder
from .ask_agent import AskAgentBuilder
from .base import BaseAgentBuilder
from .engram_ask_agent import EngramAskAgent

# Public API of the agent subpackage.
# NOTE: "AskAgentBuilder" was previously listed twice; the duplicate is removed.
__all__ = [
    "SearchAgentBuilder",
    "AskAgentBuilder",
    "BaseAgentBuilder",
    "EngramAskAgent",
]

143 changes: 143 additions & 0 deletions query_agent_benchmarking/agent/engram_ask_agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
from typing import Optional

from engram import EngramClient
import openai

from query_agent_benchmarking.agent.ask_agent import AskResponse


DEFAULT_SYSTEM_PROMPT = (
    "You are a helpful assistant that answers questions based on the user's memories. "
    "Use the provided memories as context to answer the question. "
    "If the memories don't contain enough information, say so."
)


class EngramAskAgent:
    """
    Ask agent that retrieves memories from Engram and generates answers via an LLM.

    Flow: query -> Engram search (scoped by tenant_id) -> build context -> LLM answer.

    This agent does not extend BaseAgentBuilder since it doesn't need a Weaviate connection.
    It follows the same interface (run/run_async returning AskResponse) so it plugs directly
    into run_ask_queries() / run_ask_queries_async().

    For multi-tenant datasets like LongMemEval, each query's tenant_id is used as the
    Engram user_id, optionally prefixed with engram_user_id_prefix.
    """

    def __init__(
        self,
        engram_api_key: str,
        engram_base_url: str,
        engram_user_id_prefix: str = "",
        llm_model: str = "gpt-4.1",
        system_prompt: Optional[str] = None,
        openai_api_key: Optional[str] = None,
    ):
        """
        Args:
            engram_api_key: API key for the Engram service.
            engram_base_url: Base URL of the Engram deployment.
            engram_user_id_prefix: Optional prefix prepended to each tenant_id
                when mapping it to an Engram user_id.
            llm_model: OpenAI model name used for answer generation.
            system_prompt: Overrides DEFAULT_SYSTEM_PROMPT when provided.
            openai_api_key: Explicit OpenAI key; when omitted, the client falls
                back to its own environment-based configuration.
        """
        self.engram_client = EngramClient(
            api_key=engram_api_key,
            base_url=engram_base_url,
        )
        self.engram_user_id_prefix = engram_user_id_prefix
        self.llm_model = llm_model
        self.system_prompt = system_prompt or DEFAULT_SYSTEM_PROMPT

        # Only pass api_key when explicitly given so the default client can
        # pick up OPENAI_API_KEY from the environment on its own.
        if openai_api_key:
            self.openai_client = openai.OpenAI(api_key=openai_api_key)
        else:
            self.openai_client = openai.OpenAI()

        # Plain string literal — was an f-string with no placeholders (ruff F541).
        print("EngramAskAgent initialized:")
        print(f"  Engram base URL: {engram_base_url}")
        print(f"  User ID prefix: {engram_user_id_prefix!r}")
        print(f"  LLM model: {llm_model}")

    def _resolve_user_id(self, tenant_id: Optional[str]) -> str:
        """Map a query's tenant_id to an Engram user_id.

        Raises:
            ValueError: if tenant_id is missing/empty — memory search must be
                scoped to a user, so an unscoped query is a caller bug.
        """
        if not tenant_id:
            raise ValueError(
                "EngramAskAgent requires tenant_id on each query to scope memory search. "
                "Ensure your dataset loader populates InMemoryAskQuery.tenant_id."
            )
        return f"{self.engram_user_id_prefix}{tenant_id}"

    def _build_context(self, memories) -> str:
        """Build a newline-joined context string from Engram memory search results.

        Each memory is rendered as "Memory <i>: <content>" (1-based index).
        Returns "" when no memories were retrieved.
        """
        return "\n".join(
            f"Memory {i}: {memory.content}" for i, memory in enumerate(memories, 1)
        )

    def _generate_answer(self, question: str, context: str) -> str:
        """Generate an answer from the retrieved memories using an LLM."""
        user_message = (
            f"Based on the following memories, answer the question.\n\n"
            f"Memories:\n{context}\n\n"
            f"Question: {question}"
        )
        response = self.openai_client.chat.completions.create(
            model=self.llm_model,
            messages=[
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": user_message},
            ],
        )
        # The API may return None content; normalize to "" for downstream code.
        return response.choices[0].message.content or ""

    def run(
        self,
        query: str,
        oracle_context_id: Optional[str] = None,
        tenant_id: Optional[str] = None,
    ) -> "AskResponse":
        """
        Run a synchronous ask query via Engram retrieval + LLM generation.

        Args:
            query: The question to answer.
            oracle_context_id: Unused, kept for interface compatibility.
            tenant_id: Maps to Engram user_id for scoping memory search.
        """
        user_id = self._resolve_user_id(tenant_id)
        memories = self.engram_client.memories.search(
            query=query,
            user_id=user_id,
        )
        context = self._build_context(memories)
        answer = self._generate_answer(query, context)

        return AskResponse(
            final_answer=answer,
            raw_response={"memories": [m.content for m in memories], "answer": answer},
        )

    async def run_async(
        self,
        query: str,
        oracle_context_id: Optional[str] = None,
        tenant_id: Optional[str] = None,
    ) -> "AskResponse":
        """
        Run an async ask query. Offloads the blocking sync implementation to a
        worker thread so the event loop is not stalled by network calls.

        Args:
            query: The question to answer.
            oracle_context_id: Unused, kept for interface compatibility.
            tenant_id: Maps to Engram user_id for scoping memory search.
        """
        import asyncio  # local import keeps module import-time dependencies minimal

        return await asyncio.to_thread(self.run, query, oracle_context_id, tenant_id)

    async def initialize_async(self):
        """No-op — Engram client is initialized in __init__."""
        pass

    async def close_async(self):
        """No-op — no persistent connections to close."""
        pass

    def close_sync(self):
        """No-op — no persistent connections to close."""
        pass
22 changes: 17 additions & 5 deletions query_agent_benchmarking/ask_benchmark_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from pathlib import Path
from typing import Optional, Any, Union

from query_agent_benchmarking.agent import AskAgentBuilder
from query_agent_benchmarking.agent import AskAgentBuilder, EngramAskAgent
from query_agent_benchmarking.models import (
DocsCollection,
AskQueriesCollection,
Expand Down Expand Up @@ -51,8 +51,11 @@ async def _run_ask_eval(config: dict[str, Any]) -> dict[str, Any]:
dataset_name = config.get("ask_dataset")
docs_collection = config.get("docs_collection")
queries_input = config.get("queries")

# Load queries

# Load queries — fall back to ask_dataset if queries is not explicitly provided
if not queries_input and dataset_name:
queries_input = dataset_name

if queries_input:
if isinstance(queries_input, AskQueriesCollection):
# Custom Weaviate collection
Expand Down Expand Up @@ -88,7 +91,7 @@ async def _run_ask_eval(config: dict[str, Any]) -> dict[str, Any]:
f"Got: {type(queries_input)}"
)
else:
raise ValueError("Must provide 'queries' for ask evaluation")
raise ValueError("Must provide 'queries' or 'ask_dataset' for ask evaluation")

# Determine dataset identifier
if docs_collection:
Expand Down Expand Up @@ -138,7 +141,16 @@ async def _run_ask_eval(config: dict[str, Any]) -> dict[str, Any]:
# Add agent_name to config for result serialization
config["agent_name"] = agent_name

if docs_collection:
if agent_name == "engram":
import os
ask_agent = EngramAskAgent(
engram_api_key=config.get("engram_api_key") or os.getenv("ENGRAM_API_KEY", ""),
engram_base_url=config.get("engram_base_url") or os.getenv("ENGRAM_BASE_URL", ""),
engram_user_id_prefix=config.get("engram_user_id_prefix", ""),
llm_model=config.get("engram_llm_model", "gpt-4.1"),
system_prompt=system_prompt,
)
elif docs_collection:
ask_agent = AskAgentBuilder(
agent_name=agent_name,
docs_collection=docs_collection,
Expand Down
5 changes: 3 additions & 2 deletions query_agent_benchmarking/benchmark-config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,14 @@ search_target: "image_content_weaviate"
# Ask Mode Settings
# =============================================================================
ask_agent_name: "query-agent-ask"
ask_dataset: multihoprag
queries: "multihoprag"
ask_dataset: longmemeval-s

# LLM Judge configuration
judge_model: "openai/gpt-4.1"
ensemble_k: 3

engram_base_url: https://dev-engram.labs.weaviate.io

# =============================================================================
# Shared Settings
# =============================================================================
Expand Down
1 change: 1 addition & 0 deletions query_agent_benchmarking/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@

# Ask-mode datasets this benchmark knows how to load.
supported_ask_datasets = (
    "irpapers",       # Uses IRPapers_Default collection
    "longmemeval-s",  # Uses Engram memory layer or LongmemevalS_Default collection
    "multihoprag",    # Uses MultiHopRAG_Default collection
    "officeqa",       # Uses OfficeQA_Default collection (local PDFs)
)
Expand Down
2 changes: 2 additions & 0 deletions query_agent_benchmarking/database/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
create_collection_with_vector_config,
get_vector_config,
)
from .engram_loader import engram_loader

__all__ = [
"DatasetSpec",
Expand All @@ -16,4 +17,5 @@
"database_loader",
"create_collection_with_vector_config",
"get_vector_config",
"engram_loader",
]
12 changes: 10 additions & 2 deletions query_agent_benchmarking/database/database_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,15 +122,23 @@ def create_collection_with_vector_config(
# Public API: primary entry point used by `scripts/populate-db.py` and package users.
def database_loader(recreate: bool = True, tag: str = "Default") -> None:
"""
Load dataset from config and populate Weaviate collection.
Load dataset from config and populate Weaviate collection (or Engram if use_engram is set).

Args:
recreate: Whether to drop existing collection before creating
tag: Suffix to add to collection name
"""
config_path = Path(__file__).parent / "database_loader_config.yml"
config = load_config(config_path)

if config.get("use_engram", False):
from .engram_loader import engram_loader
engram_loader(
dataset_name=config.get("dataset_name"),
engram_base_url=config.get("engram_base_url"),
)
return

# Get provider headers for all configured embedding providers.
headers = _resolve_provider_headers(
embedding_providers=config.get("embedding_providers"),
Expand Down
6 changes: 5 additions & 1 deletion query_agent_benchmarking/database/database_loader_config.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
dataset_name: irpapers
dataset_name: longmemeval-s

# NOTE: This is a work in progress, need a better abstraction for multiple models from the same provider.
# The problem is X_weaviate_snowflake_... ends up being really verbose when you need to
Expand All @@ -17,3 +17,7 @@ ksim: 4
dprojections: 16
repetitions: 10
ef: 500

# Engram parameters
use_engram: True
engram_base_url: https://dev-engram.labs.weaviate.io
Loading
Loading