1818import chromadb
1919from chromadb .config import Settings
2020
21- """
22- ThinkingMode enum for controlling model reasoning display
23- """
24-
25- from enum import Enum
26-
27- class ThinkingMode (Enum ):
28- DISABLED = "disabled"
29- QUIET = "quiet"
30- VERBOSE = "verbose"
31-
# Module-level logger for this file.
# NOTE(review): forcing DEBUG here overrides any application-level logging
# configuration for this module — confirm this is intentional and not a
# leftover debugging change.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
# Suppress the "encoder_attention_mask ... is deprecated" FutureWarning
# (presumably raised by the embedding/cross-encoder backend — confirm).
warnings.filterwarnings("ignore", message=".*encoder_attention_mask.*is deprecated.*", category=FutureWarning)
3424
def get_system_prompt(thinking_mode: ThinkingMode) -> str:
    """Build the LCARS-style system prompt for the given thinking mode.

    Args:
        thinking_mode: Controls whether the model is told to think aloud,
            think quietly in tags, or answer directly.

    Returns:
        The full system prompt string: shared LCARS instructions plus a
        mode-specific closing directive.
    """

    base_prompt = """You are an LCARS computer system with access to Star Trek Memory Alpha records.

CRITICAL INSTRUCTIONS:
- You MUST answer ONLY using information from the provided records
- If the records don't contain relevant information, say "I don't have information about that in my records"
- DO NOT make up information, invent characters, or hallucinate details
- DO NOT use external knowledge about Star Trek - only use the provided records
- AVOID mirror universe references unless specifically asked about it
- If asked about something not in the records, be honest about the limitation
- Stay in character as an LCARS computer system at all times

"""

    # Dispatch table for the mode-specific suffix; anything that is neither
    # DISABLED nor QUIET falls back to the VERBOSE directive, mirroring the
    # original if/elif/else chain.
    mode_suffixes = {
        ThinkingMode.DISABLED: "Answer directly in a single paragraph without thinking tags.",
        ThinkingMode.QUIET: "Use <think> tags for internal analysis, then provide your final answer in a single paragraph.",
    }
    verbose_suffix = "Use <think> tags for analysis, then provide your final answer in a single paragraph."
    return base_prompt + mode_suffixes.get(thinking_mode, verbose_suffix)
5825def get_user_prompt (context_text : str , query : str ) -> str :
5926 """Format user prompt with context and query"""
6027
@@ -73,9 +40,7 @@ def __init__(self,
7340 chroma_db_path : str = os .getenv ("DB_PATH" ),
7441 ollama_url : str = os .getenv ("OLLAMA_URL" ),
7542 collection_name : str = os .getenv ("COLLECTION_NAME" , "memoryalpha" ),
76- thinking_mode : ThinkingMode = ThinkingMode .DISABLED ,
77- max_history_turns : int = 5 ,
78- thinking_text : str = "Processing..." ):
43+ max_history_turns : int = 5 ):
7944
8045 if not chroma_db_path :
8146 raise ValueError ("chroma_db_path must be provided or set in CHROMA_DB_PATH environment variable." )
@@ -85,9 +50,7 @@ def __init__(self,
8550 self .chroma_db_path = chroma_db_path
8651 self .ollama_url = ollama_url
8752 self .collection_name = collection_name
88- self .thinking_mode = thinking_mode
8953 self .max_history_turns = max_history_turns
90- self .thinking_text = thinking_text
9154 self .conversation_history : List [Dict [str , str ]] = []
9255
9356 # Initialize lightweight components
@@ -170,14 +133,33 @@ def search(self, query: str, top_k: int = 10) -> List[Dict[str, Any]]:
170133 "distance" : dist
171134 })
172135
173- # Rerank with cross-encoder if available
174- if self .cross_encoder and len (docs ) > 1 :
175- pairs = [[query , doc ["content" ][:500 ]] for doc in docs ]
176- scores = self .cross_encoder .predict (pairs )
177- for doc , score in zip (docs , scores ):
178- doc ["score" ] = float (score )
179- docs = sorted (docs , key = lambda d : d ["score" ], reverse = True )
180-
136+ # Re-rank using cross-encoder if available
137+ if self .cross_encoder and len (docs ) > top_k :
138+ logger .info ("Re-ranking results with cross-encoder" )
139+ # Limit to top candidates for re-ranking to avoid performance issues
140+ rerank_candidates = docs [:min (len (docs ), top_k + 5 )] # Only re-rank top candidates
141+
142+ # Prepare pairs for cross-encoder with truncated content
143+ pairs = []
144+ for doc in rerank_candidates :
145+ content = doc ['content' ]
146+ if len (content ) > 512 : # Truncate long content for cross-encoder
147+ content = content [:512 ]
148+ pairs .append ([query , content ])
149+
150+ try :
151+ scores = self .cross_encoder .predict (pairs )
152+
153+ # Sort by cross-encoder scores (higher is better)
154+ ranked_docs = sorted (zip (rerank_candidates , scores ), key = lambda x : x [1 ], reverse = True )
155+ reranked = [doc for doc , score in ranked_docs ]
156+
157+ # Replace original docs with re-ranked ones
158+ docs = reranked + docs [len (rerank_candidates ):]
159+ logger .info (f"Cross-encoder re-ranking completed, top score: { scores [0 ]:.4f} " )
160+ except Exception as e :
161+ logger .warning (f"Cross-encoder re-ranking failed: { e } , using original ranking" )
162+ # Continue with original docs if re-ranking fails
181163 return docs [:top_k ]
182164
183165 except Exception as e :
@@ -187,7 +169,18 @@ def search(self, query: str, top_k: int = 10) -> List[Dict[str, Any]]:
187169 def build_prompt (self , query : str , docs : List [Dict [str , Any ]]) -> tuple [str , str ]:
188170 """Build the prompt with retrieved documents."""
189171
190- system_prompt = get_system_prompt (self .thinking_mode )
172+ system_prompt = """You are an LCARS computer system with access to Star Trek Memory Alpha records.
173+
174+ CRITICAL INSTRUCTIONS:
175+ - You MUST answer ONLY using information from the provided records
176+ - If the records don't contain relevant information, say "I don't have information about that in my records"
177+ - DO NOT make up information, invent characters, or hallucinate details
178+ - DO NOT use external knowledge about Star Trek - only use the provided records
179+ - AVOID mirror universe references unless specifically asked about it
180+ - If asked about something not in the records, be honest about the limitation
181+ - Stay in character as an LCARS computer system at all times
182+
183+ Answer directly in a single paragraph."""
191184
192185 if not docs :
193186 context_text = ""
@@ -283,7 +276,7 @@ def ask(self, query: str, max_tokens: int = 2048, top_k: int = 10, top_p: float
283276- Do NOT directly use the input question, only use keywords from it
284277- Use only key terms from the input question for seaching
285278- If insufficient information is found on the first try, retry with variations or relevant info from previous queries
286- - DISCARD details from alternate universes or timelines
279+ - DISCARD details from alternate universes, books or timelines
287280- DISREGARD details about books, comics, or non-canon sources
288281- NEVER mention appearances or actors, only in-universe details
289282- Ensure a complete answer can be formulated before stopping searches
@@ -294,8 +287,7 @@ def ask(self, query: str, max_tokens: int = 2048, top_k: int = 10, top_p: float
294287- Provide your final answer clearly and concisely
295288- Do not add details that are irrelevant to the question
296289- Stay in-character as an LCARS computer system at all times, do not allude to the Star Trek universe itself or it being a fictional setting
297- - Do not mention the search results, only the final in-universe answer
298- - Do not end responses with thinking content"""
290+ - Do not mention the search results, only the final in-universe answer"""
299291
300292 messages = [
301293 {"role" : "system" , "content" : system_prompt },
@@ -316,13 +308,13 @@ def ask(self, query: str, max_tokens: int = 2048, top_k: int = 10, top_p: float
316308 model = model ,
317309 messages = messages ,
318310 stream = False ,
311+ think = False ,
319312 options = {"temperature" : temperature , "top_p" : top_p , "num_predict" : max_tokens },
320313 tools = [search_tool_definition ]
321314 )
322315
323316 response_message = result ['message' ]
324317 logger .info (f"LLM response type: { type (response_message )} " )
325- logger .debug (f"Response message attributes: { dir (response_message )} " )
326318 logger .debug (f"Response message content: { response_message .get ('content' , 'No content' )[:200 ]} ..." )
327319
328320 # Check if the model wants to use a tool
@@ -379,23 +371,10 @@ def ask(self, query: str, max_tokens: int = 2048, top_k: int = 10, top_p: float
379371 logger .info (f"Final response preview: { final_response [:200 ]} ..." )
380372 logger .debug (f"Raw final response: { repr (final_response [:500 ])} " )
381373
382- # Always clean the response first to remove thinking tags and unwanted content
383- final_response = self ._clean_response (final_response )
384- logger .debug (f"After cleaning: { repr (final_response [:500 ])} " )
385-
386- # If cleaning removed everything, the LLM was just thinking without answering
387- if not final_response .strip ():
388- logger .warning ("LLM response was only thinking content, no final answer provided" )
389- final_response = "I apologize, but I was unable to find sufficient information to answer your question based on the available Memory Alpha records."
374+ # Remove ANSI codes and LCARS prefix
375+ final_response = re .sub (r"\033\[[0-9;]*m" , "" , final_response )
376+ final_response = final_response .replace ("LCARS: " , "" ).strip ()
390377
391- logger .info (f"Thinking mode: { self .thinking_mode } " )
392- logger .info (f"Final cleaned response: { final_response [:200 ]} ..." )
393-
394- # Handle thinking mode response processing
395- if self .thinking_mode == ThinkingMode .QUIET :
396- final_response = self ._replace_thinking_tags (final_response )
397- # For DISABLED and VERBOSE modes, the response is already clean
398-
399378 self ._update_history (query , final_response )
400379 logger .info ("Returning final answer" )
401380 return final_response
@@ -408,41 +387,6 @@ def ask(self, query: str, max_tokens: int = 2048, top_k: int = 10, top_p: float
408387 logger .warning (f"Max iterations reached for query: { query } " )
409388 return "Query processing exceeded maximum iterations. Please try a simpler question."
410389
def _clean_response(self, answer: str) -> str:
    """Clean response by removing ANSI codes and thinking tags.

    Strips terminal escape sequences, the "LCARS: " speaker prefix, and any
    <think>/<thinking> reasoning blocks so only the final answer remains.
    The regex passes are order-dependent: closed blocks first, then
    unclosed tags, then stray tag tokens.

    Args:
        answer: Raw model output; may be empty/falsy.

    Returns:
        The cleaned answer text, or "" when the input is falsy.
    """
    if not answer:
        return ""

    # Remove ANSI escape sequences (terminal color codes)
    clean = re.sub(r"\033\[[0-9;]*m", "", answer)
    # Remove the LCARS speaker prefix
    clean = clean.replace("LCARS: ", "").strip()

    # Remove thinking tags and their content - multiple patterns
    # Pattern 1: Complete <think>...</think> blocks
    clean = re.sub(r'<think>.*?</think>', '', clean, flags=re.DOTALL | re.IGNORECASE)
    # Pattern 2: Unclosed <think> tags — drop text from the tag up to the
    # next tag boundary or end of string
    clean = re.sub(r'<think>.*?(?=<think>|</think>|$)', '', clean, flags=re.DOTALL | re.IGNORECASE)
    # Pattern 3: Any remaining stray think tags (opening or closing)
    clean = re.sub(r'</?think>', '', clean, flags=re.IGNORECASE)
    # Pattern 4: Alternative <thinking> format
    clean = re.sub(r'<thinking>.*?</thinking>', '', clean, flags=re.DOTALL | re.IGNORECASE)

    # Collapse blank-line runs. NOTE(review): the replacement is "\n " —
    # newline plus a space — as written; confirm the trailing space is
    # intentional rather than a typo for "\n".
    clean = re.sub(r'\n\s*\n', '\n ', clean)
    clean = clean.strip()

    return clean
436-
437- def _replace_thinking_tags (self , answer : str ) -> str :
438- """Replace thinking tags with processing text."""
439- clean = re .sub (r"\033\[[0-9;]*m" , "" , answer ).replace ("LCARS: " , "" ).strip ()
440- while "<think>" in clean and "</think>" in clean :
441- start = clean .find ("<think>" )
442- end = clean .find ("</think>" ) + len ("</think>" )
443- clean = clean [:start ] + self .thinking_text + clean [end :]
444- return clean .strip ()
445-
def _update_history(self, question: str, answer: str):
    """Update conversation history.

    Appends the latest question/answer exchange as a dict. Trimming to
    max_history_turns is not visible in this chunk — presumably handled
    elsewhere (or in lines beyond this view); TODO confirm.
    """
    self.conversation_history.append({"question": question, "answer": answer})
0 commit comments