Skip to content

Commit 15a37e9

Browse files
committed
Refactor ask endpoint: Update to use POST method with JSON payload for improved API usage; modify test cases for new question formats. Clean up unnecessary code and enhance logging in RAG implementation.
1 parent ea0249b commit 15a37e9

File tree

7 files changed: +91 additions, -123 deletions

.devcontainer/devcontainer.json

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,6 @@
2222
"extensions": [
2323
"ms-python.python",
2424
"zaaack.markdown-editor",
25-
"bierner.emojisense",
2625
"ms-python.debugpy"
2726
]
2827
}

.github/workflows/ci-build.yml

Lines changed: 6 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -40,14 +40,15 @@ jobs:
4040
4141
- name: Test ask endpoint
4242
run: |
43-
# Test the synchronous ask endpoint with a simple query
44-
response=$(curl -s -f "http://localhost:8000/memoryalpha/rag/ask?question=What%20is%20the%20Enterprise?&thinkingmode=DISABLED&max_tokens=100&top_k=3")
45-
43+
# Test the ask endpoint with a simple query
44+
response=$(curl -X POST "http://localhost:8000/memoryalpha/rag/ask" -H "Content-Type: application/json" -d '{
45+
"question": "What is the color of Vulcan blood?"
46+
}')
4647
# Check if response contains expected content
47-
if echo "$response" | grep -q "Enterprise"; then
48+
if echo "$response" | grep -q "green"; then
4849
echo "✅ Ask endpoint test passed"
4950
else
50-
echo "❌ Ask endpoint test failed - no relevant content found"
51+
echo "❌ Ask endpoint test failed, answer did not contain expected content"
5152
echo "Response: $response"
5253
exit 1
5354
fi

.github/workflows/pr-check.yml

Lines changed: 6 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -39,14 +39,15 @@ jobs:
3939
4040
- name: Test ask endpoint
4141
run: |
42-
# Test the synchronous ask endpoint with a simple query
43-
response=$(curl -s -f "http://localhost:8000/memoryalpha/rag/ask?question=What%20is%20the%20Enterprise?&thinkingmode=DISABLED&max_tokens=100&top_k=3")
44-
42+
# Test the ask endpoint with a simple query
43+
response=$(curl -X POST "http://localhost:8000/memoryalpha/rag/ask" -H "Content-Type: application/json" -d '{
44+
"question": "What was the name of human who discovered warp drive?"
45+
}')
4546
# Check if response contains expected content
46-
if echo "$response" | grep -q "Enterprise"; then
47+
if echo "$response" | grep -q "Zefram Cochrane"; then
4748
echo "✅ Ask endpoint test passed"
4849
else
49-
echo "❌ Ask endpoint test failed - no relevant content found"
50+
echo "❌ Ask endpoint test failed, answer did not contain expected content"
5051
echo "Response: $response"
5152
exit 1
5253
fi

api/memoryalpha/ask.py

Lines changed: 30 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1,30 +1,53 @@
1-
from fastapi import APIRouter, Query
1+
from fastapi import APIRouter, Query, Body
22
from fastapi.responses import JSONResponse
3+
from pydantic import BaseModel
4+
from typing import Optional
35

4-
from .rag import MemoryAlphaRAG, ThinkingMode
6+
from .rag import MemoryAlphaRAG
57

68
router = APIRouter()
79

810
# Singleton or global instance for demo; in production, manage lifecycle properly
911
rag_instance = MemoryAlphaRAG()
10-
ThinkingMode = ThinkingMode
12+
13+
class AskRequest(BaseModel):
14+
question: str
15+
max_tokens: Optional[int] = 2048
16+
top_k: Optional[int] = 10
17+
top_p: Optional[float] = 0.8
18+
temperature: Optional[float] = 0.3
19+
20+
@router.post("/memoryalpha/rag/ask")
21+
def ask_endpoint_post(request: AskRequest):
22+
"""
23+
Query the RAG pipeline and return the full response.
24+
Accepts POST requests with JSON payload for cleaner API usage.
25+
"""
26+
try:
27+
answer = rag_instance.ask(
28+
request.question,
29+
max_tokens=request.max_tokens,
30+
top_k=request.top_k,
31+
top_p=request.top_p,
32+
temperature=request.temperature
33+
)
34+
return JSONResponse(content={"response": answer})
35+
except Exception as e:
36+
return JSONResponse(status_code=500, content={"error": str(e)})
1137

1238
@router.get("/memoryalpha/rag/ask")
1339
def ask_endpoint(
1440
question: str = Query(..., description="The user question"),
15-
thinkingmode: str = Query("DISABLED", description="Thinking mode: DISABLED, QUIET, or VERBOSE"),
1641
max_tokens: int = Query(2048, description="Maximum tokens to generate"),
1742
top_k: int = Query(10, description="Number of documents to retrieve"),
1843
top_p: float = Query(0.8, description="Sampling parameter"),
1944
temperature: float = Query(0.3, description="Randomness/creativity of output")
2045
):
2146
"""
22-
Query the RAG pipeline and return the full response (including thinking if enabled).
47+
Query the RAG pipeline and return the full response.
2348
Now uses advanced tool-enabled RAG by default for better results.
2449
"""
2550
try:
26-
# Set the thinking mode for this request
27-
rag_instance.thinking_mode = ThinkingMode[thinkingmode.upper()]
2851
answer = rag_instance.ask(
2952
question,
3053
max_tokens=max_tokens,

api/memoryalpha/rag.py

Lines changed: 47 additions & 103 deletions
Original file line number | Diff line number | Diff line change
@@ -18,43 +18,10 @@
1818
import chromadb
1919
from chromadb.config import Settings
2020

21-
"""
22-
ThinkingMode enum for controlling model reasoning display
23-
"""
24-
25-
from enum import Enum
26-
27-
class ThinkingMode(Enum):
28-
DISABLED = "disabled"
29-
QUIET = "quiet"
30-
VERBOSE = "verbose"
31-
3221
logger = logging.getLogger(__name__)
22+
logger.setLevel(logging.DEBUG)
3323
warnings.filterwarnings("ignore", message=".*encoder_attention_mask.*is deprecated.*", category=FutureWarning)
3424

35-
def get_system_prompt(thinking_mode: ThinkingMode) -> str:
36-
"""Generate the LCARS-style system prompt based on thinking mode"""
37-
38-
base_prompt = """You are an LCARS computer system with access to Star Trek Memory Alpha records.
39-
40-
CRITICAL INSTRUCTIONS:
41-
- You MUST answer ONLY using information from the provided records
42-
- If the records don't contain relevant information, say "I don't have information about that in my records"
43-
- DO NOT make up information, invent characters, or hallucinate details
44-
- DO NOT use external knowledge about Star Trek - only use the provided records
45-
- AVOID mirror universe references unless specifically asked about it
46-
- If asked about something not in the records, be honest about the limitation
47-
- Stay in character as an LCARS computer system at all times
48-
49-
"""
50-
51-
if thinking_mode == ThinkingMode.DISABLED:
52-
return base_prompt + "Answer directly in a single paragraph without thinking tags."
53-
elif thinking_mode == ThinkingMode.QUIET:
54-
return base_prompt + "Use <think> tags for internal analysis, then provide your final answer in a single paragraph."
55-
else: # VERBOSE
56-
return base_prompt + "Use <think> tags for analysis, then provide your final answer in a single paragraph."
57-
5825
def get_user_prompt(context_text: str, query: str) -> str:
5926
"""Format user prompt with context and query"""
6027

@@ -73,9 +40,7 @@ def __init__(self,
7340
chroma_db_path: str = os.getenv("DB_PATH"),
7441
ollama_url: str = os.getenv("OLLAMA_URL"),
7542
collection_name: str = os.getenv("COLLECTION_NAME", "memoryalpha"),
76-
thinking_mode: ThinkingMode = ThinkingMode.DISABLED,
77-
max_history_turns: int = 5,
78-
thinking_text: str = "Processing..."):
43+
max_history_turns: int = 5):
7944

8045
if not chroma_db_path:
8146
raise ValueError("chroma_db_path must be provided or set in CHROMA_DB_PATH environment variable.")
@@ -85,9 +50,7 @@ def __init__(self,
8550
self.chroma_db_path = chroma_db_path
8651
self.ollama_url = ollama_url
8752
self.collection_name = collection_name
88-
self.thinking_mode = thinking_mode
8953
self.max_history_turns = max_history_turns
90-
self.thinking_text = thinking_text
9154
self.conversation_history: List[Dict[str, str]] = []
9255

9356
# Initialize lightweight components
@@ -170,14 +133,33 @@ def search(self, query: str, top_k: int = 10) -> List[Dict[str, Any]]:
170133
"distance": dist
171134
})
172135

173-
# Rerank with cross-encoder if available
174-
if self.cross_encoder and len(docs) > 1:
175-
pairs = [[query, doc["content"][:500]] for doc in docs]
176-
scores = self.cross_encoder.predict(pairs)
177-
for doc, score in zip(docs, scores):
178-
doc["score"] = float(score)
179-
docs = sorted(docs, key=lambda d: d["score"], reverse=True)
180-
136+
# Re-rank using cross-encoder if available
137+
if self.cross_encoder and len(docs) > top_k:
138+
logger.info("Re-ranking results with cross-encoder")
139+
# Limit to top candidates for re-ranking to avoid performance issues
140+
rerank_candidates = docs[:min(len(docs), top_k + 5)] # Only re-rank top candidates
141+
142+
# Prepare pairs for cross-encoder with truncated content
143+
pairs = []
144+
for doc in rerank_candidates:
145+
content = doc['content']
146+
if len(content) > 512: # Truncate long content for cross-encoder
147+
content = content[:512]
148+
pairs.append([query, content])
149+
150+
try:
151+
scores = self.cross_encoder.predict(pairs)
152+
153+
# Sort by cross-encoder scores (higher is better)
154+
ranked_docs = sorted(zip(rerank_candidates, scores), key=lambda x: x[1], reverse=True)
155+
reranked = [doc for doc, score in ranked_docs]
156+
157+
# Replace original docs with re-ranked ones
158+
docs = reranked + docs[len(rerank_candidates):]
159+
logger.info(f"Cross-encoder re-ranking completed, top score: {scores[0]:.4f}")
160+
except Exception as e:
161+
logger.warning(f"Cross-encoder re-ranking failed: {e}, using original ranking")
162+
# Continue with original docs if re-ranking fails
181163
return docs[:top_k]
182164

183165
except Exception as e:
@@ -187,7 +169,18 @@ def search(self, query: str, top_k: int = 10) -> List[Dict[str, Any]]:
187169
def build_prompt(self, query: str, docs: List[Dict[str, Any]]) -> tuple[str, str]:
188170
"""Build the prompt with retrieved documents."""
189171

190-
system_prompt = get_system_prompt(self.thinking_mode)
172+
system_prompt = """You are an LCARS computer system with access to Star Trek Memory Alpha records.
173+
174+
CRITICAL INSTRUCTIONS:
175+
- You MUST answer ONLY using information from the provided records
176+
- If the records don't contain relevant information, say "I don't have information about that in my records"
177+
- DO NOT make up information, invent characters, or hallucinate details
178+
- DO NOT use external knowledge about Star Trek - only use the provided records
179+
- AVOID mirror universe references unless specifically asked about it
180+
- If asked about something not in the records, be honest about the limitation
181+
- Stay in character as an LCARS computer system at all times
182+
183+
Answer directly in a single paragraph."""
191184

192185
if not docs:
193186
context_text = ""
@@ -283,7 +276,7 @@ def ask(self, query: str, max_tokens: int = 2048, top_k: int = 10, top_p: float
283276
- Do NOT directly use the input question, only use keywords from it
284277
- Use only key terms from the input question for seaching
285278
- If insufficient information is found on the first try, retry with variations or relevant info from previous queries
286-
- DISCARD details from alternate universes or timelines
279+
- DISCARD details from alternate universes, books or timelines
287280
- DISREGARD details about books, comics, or non-canon sources
288281
- NEVER mention appearances or actors, only in-universe details
289282
- Ensure a complete answer can be formulated before stopping searches
@@ -294,8 +287,7 @@ def ask(self, query: str, max_tokens: int = 2048, top_k: int = 10, top_p: float
294287
- Provide your final answer clearly and concisely
295288
- Do not add details that are irrelevant to the question
296289
- Stay in-character as an LCARS computer system at all times, do not allude to the Star Trek universe itself or it being a fictional setting
297-
- Do not mention the search results, only the final in-universe answer
298-
- Do not end responses with thinking content"""
290+
- Do not mention the search results, only the final in-universe answer"""
299291

300292
messages = [
301293
{"role": "system", "content": system_prompt},
@@ -316,13 +308,13 @@ def ask(self, query: str, max_tokens: int = 2048, top_k: int = 10, top_p: float
316308
model=model,
317309
messages=messages,
318310
stream=False,
311+
think=False,
319312
options={"temperature": temperature, "top_p": top_p, "num_predict": max_tokens},
320313
tools=[search_tool_definition]
321314
)
322315

323316
response_message = result['message']
324317
logger.info(f"LLM response type: {type(response_message)}")
325-
logger.debug(f"Response message attributes: {dir(response_message)}")
326318
logger.debug(f"Response message content: {response_message.get('content', 'No content')[:200]}...")
327319

328320
# Check if the model wants to use a tool
@@ -379,23 +371,10 @@ def ask(self, query: str, max_tokens: int = 2048, top_k: int = 10, top_p: float
379371
logger.info(f"Final response preview: {final_response[:200]}...")
380372
logger.debug(f"Raw final response: {repr(final_response[:500])}")
381373

382-
# Always clean the response first to remove thinking tags and unwanted content
383-
final_response = self._clean_response(final_response)
384-
logger.debug(f"After cleaning: {repr(final_response[:500])}")
385-
386-
# If cleaning removed everything, the LLM was just thinking without answering
387-
if not final_response.strip():
388-
logger.warning("LLM response was only thinking content, no final answer provided")
389-
final_response = "I apologize, but I was unable to find sufficient information to answer your question based on the available Memory Alpha records."
374+
# Remove ANSI codes and LCARS prefix
375+
final_response = re.sub(r"\033\[[0-9;]*m", "", final_response)
376+
final_response = final_response.replace("LCARS: ", "").strip()
390377

391-
logger.info(f"Thinking mode: {self.thinking_mode}")
392-
logger.info(f"Final cleaned response: {final_response[:200]}...")
393-
394-
# Handle thinking mode response processing
395-
if self.thinking_mode == ThinkingMode.QUIET:
396-
final_response = self._replace_thinking_tags(final_response)
397-
# For DISABLED and VERBOSE modes, the response is already clean
398-
399378
self._update_history(query, final_response)
400379
logger.info("Returning final answer")
401380
return final_response
@@ -408,41 +387,6 @@ def ask(self, query: str, max_tokens: int = 2048, top_k: int = 10, top_p: float
408387
logger.warning(f"Max iterations reached for query: {query}")
409388
return "Query processing exceeded maximum iterations. Please try a simpler question."
410389

411-
def _clean_response(self, answer: str) -> str:
412-
"""Clean response by removing ANSI codes and thinking tags."""
413-
if not answer:
414-
return ""
415-
416-
# Remove ANSI codes
417-
clean = re.sub(r"\033\[[0-9;]*m", "", answer)
418-
# Remove LCARS prefix
419-
clean = clean.replace("LCARS: ", "").strip()
420-
421-
# Remove thinking tags and their content - multiple patterns
422-
# Pattern 1: Complete <think>...</think> blocks
423-
clean = re.sub(r'<think>.*?</think>', '', clean, flags=re.DOTALL | re.IGNORECASE)
424-
# Pattern 2: Unclosed <think> tags
425-
clean = re.sub(r'<think>.*?(?=<think>|</think>|$)', '', clean, flags=re.DOTALL | re.IGNORECASE)
426-
# Pattern 3: Any remaining think tags
427-
clean = re.sub(r'</?think>', '', clean, flags=re.IGNORECASE)
428-
# Pattern 4: Alternative thinking formats
429-
clean = re.sub(r'<thinking>.*?</thinking>', '', clean, flags=re.DOTALL | re.IGNORECASE)
430-
431-
# Remove extra whitespace and newlines
432-
clean = re.sub(r'\n\s*\n', '\n', clean)
433-
clean = clean.strip()
434-
435-
return clean
436-
437-
def _replace_thinking_tags(self, answer: str) -> str:
438-
"""Replace thinking tags with processing text."""
439-
clean = re.sub(r"\033\[[0-9;]*m", "", answer).replace("LCARS: ", "").strip()
440-
while "<think>" in clean and "</think>" in clean:
441-
start = clean.find("<think>")
442-
end = clean.find("</think>") + len("</think>")
443-
clean = clean[:start] + self.thinking_text + clean[end:]
444-
return clean.strip()
445-
446390
def _update_history(self, question: str, answer: str):
447391
"""Update conversation history."""
448392
self.conversation_history.append({"question": question, "answer": answer})

chat.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22

33
# Interactive chat script for MemoryAlpha RAG API
44
BASE_URL="http://localhost:8000"
5-
THINKING_MODE="DISABLED"
5+
THINKING_MODE="VERBOSE"
66
MAX_TOKENS=2048
77
TOP_K=5
88
TOP_P=0.8

docker-compose.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@ services:
1111
- odn
1212
env_file:
1313
- .env
14-
14+
1515
lcars:
1616
build:
1717
context: .

0 commit comments

Comments
 (0)