@@ -119,15 +119,9 @@ def post_retrieve(
         info=None,
         search_tool_memory: bool = False,
         tool_mem_top_k: int = 6,
-        dedup: str | None = None,
         plugin=False,
     ):
-        if dedup == "no":
-            deduped = retrieved_results
-        elif dedup == "sim":
-            deduped = self._deduplicate_similar_results(retrieved_results)
-        else:
-            deduped = self._deduplicate_results(retrieved_results)
+        deduped = self._deduplicate_results(retrieved_results)
         final_results = self._sort_and_trim(
             deduped, top_k, plugin, search_tool_memory, tool_mem_top_k
         )
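
Note: with the dedup flag removed, post_retrieve now always funnels results through _deduplicate_results, which keys on the exact memory text. A hypothetical standalone sketch of that step follows; the keep-the-higher-score comparison is an assumption, since the diff only shows the dictionary assignment and the final return.

# Sketch only: exact-text dedup keyed on item.memory. The ">" comparison is an
# assumption; the diff shows just the assignment and the return of deduped.values().
def deduplicate_by_text(results):
    """results: list of (item, score) pairs; keeps one pair per memory string."""
    deduped = {}
    for item, score in results:
        if item.memory not in deduped or score > deduped[item.memory][1]:
            deduped[item.memory] = (item, score)
    return list(deduped.values())
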
@@ -147,7 +141,6 @@ def search(
         user_name: str | None = None,
         search_tool_memory: bool = False,
         tool_mem_top_k: int = 6,
-        dedup: str | None = None,
         **kwargs,
     ) -> list[TextualMemoryItem]:
         """
@@ -180,11 +173,7 @@ def search(
         if kwargs.get("plugin", False):
             logger.info(f"[SEARCH] Retrieve from plugin: {query}")
             retrieved_results = self._retrieve_simple(
-                query=query,
-                top_k=top_k,
-                search_filter=search_filter,
-                user_name=user_name,
-                dedup=dedup,
+                query=query, top_k=top_k, search_filter=search_filter, user_name=user_name
             )
         else:
             retrieved_results = self.retrieve(
@@ -213,7 +202,6 @@ def search(
             plugin=kwargs.get("plugin", False),
             search_tool_memory=search_tool_memory,
             tool_mem_top_k=tool_mem_top_k,
-            dedup=None if kwargs.get("plugin", False) and dedup == "sim" else dedup,
         )
 
         logger.info(f"[SEARCH] Done. Total {len(final_results)} results.")
@@ -296,50 +284,6 @@ def _parse_task(
 
         return parsed_goal, query_embedding, context, query
 
-    @timed
-    def _retrieve_simple(
-        self,
-        query: str,
-        top_k: int,
-        search_filter: dict | None = None,
-        user_name: str | None = None,
-        dedup: str | None = None,
-        **kwargs,
-    ):
-        """Retrieve from by keywords and embedding"""
-        query_words = []
-        if self.tokenizer:
-            query_words = self.tokenizer.tokenize_mixed(query)
-        else:
-            query_words = query.strip().split()
-        query_words = [query, *query_words]
-        logger.info(f"[SIMPLESEARCH] Query words: {query_words}")
-        query_embeddings = self.embedder.embed(query_words)
-
-        items = self.graph_retriever.retrieve_from_mixed(
-            top_k=top_k * 2,
-            memory_scope=None,
-            query_embedding=query_embeddings,
-            search_filter=search_filter,
-            user_name=user_name,
-            use_fast_graph=self.use_fast_graph,
-        )
-        logger.info(f"[SIMPLESEARCH] Items count: {len(items)}")
-        documents = [getattr(item, "memory", "") for item in items]
-        documents_embeddings = self.embedder.embed(documents)
-        similarity_matrix = cosine_similarity_matrix(documents_embeddings)
-        selected_indices, _ = find_best_unrelated_subgroup(documents, similarity_matrix)
-        selected_items = [items[i] for i in selected_indices]
-        logger.info(
-            f"[SIMPLESEARCH] after unrelated subgroup selection items count: {len(selected_items)}"
-        )
-        return self.reranker.rerank(
-            query=query,
-            query_embedding=query_embeddings[0],
-            graph_results=selected_items,
-            top_k=top_k,
-        )
-
     @timed
     def _retrieve_paths(
         self,
@@ -723,17 +667,14 @@ def _retrieve_simple(
             user_name=user_name,
         )
         logger.info(f"[SIMPLESEARCH] Items count: {len(items)}")
-        if dedup == "no":
-            selected_items = items
-        else:
-            documents = [getattr(item, "memory", "") for item in items]
-            documents_embeddings = self.embedder.embed(documents)
-            similarity_matrix = cosine_similarity_matrix(documents_embeddings)
-            selected_indices, _ = find_best_unrelated_subgroup(documents, similarity_matrix)
-            selected_items = [items[i] for i in selected_indices]
-            logger.info(
-                f"[SIMPLESEARCH] after unrelated subgroup selection items count: {len(selected_items)}"
-            )
+        documents = [getattr(item, "memory", "") for item in items]
+        documents_embeddings = self.embedder.embed(documents)
+        similarity_matrix = cosine_similarity_matrix(documents_embeddings)
+        selected_indices, _ = find_best_unrelated_subgroup(documents, similarity_matrix)
+        selected_items = [items[i] for i in selected_indices]
+        logger.info(
+            f"[SIMPLESEARCH] after unrelated subgroup selection items count: {len(selected_items)}"
+        )
         return self.reranker.rerank(
             query=query,
             query_embedding=query_embeddings[0],
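
Note: the surviving _retrieve_simple path still embeds the candidate memories, builds a pairwise cosine-similarity matrix, and keeps an "unrelated subgroup" before reranking. find_best_unrelated_subgroup is not defined in this diff; the greedy threshold filter below is an assumed stand-in that mirrors the pattern of the removed _deduplicate_similar_results (0.85 default threshold), and cosine_similarity_matrix here is a local reimplementation for illustration only.

# Illustration only; not the project's actual helpers.
import numpy as np

def cosine_similarity_matrix(embeddings):
    # Pairwise cosine similarity between row vectors.
    x = np.asarray(embeddings, dtype=float)
    x = x / np.clip(np.linalg.norm(x, axis=1, keepdims=True), 1e-12, None)
    return x @ x.T

def greedy_unrelated_subgroup(documents, similarity_matrix, threshold=0.85):
    # Keep an item only if it is not too similar to anything already kept.
    selected_indices = []
    for i in range(len(documents)):
        if all(similarity_matrix[i][j] <= threshold for j in selected_indices):
            selected_indices.append(i)
    return selected_indices, [documents[i] for i in selected_indices]
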
@@ -750,26 +691,6 @@ def _deduplicate_results(self, results):
                 deduped[item.memory] = (item, score)
         return list(deduped.values())
 
-    @timed
-    def _deduplicate_similar_results(
-        self, results: list[tuple[TextualMemoryItem, float]], similarity_threshold: float = 0.85
-    ):
-        """Deduplicate results by semantic similarity while keeping higher scores."""
-        if len(results) <= 1:
-            return results
-
-        sorted_results = sorted(results, key=lambda pair: pair[1], reverse=True)
-        documents = [getattr(item, "memory", "") for item, _ in sorted_results]
-        embeddings = self.embedder.embed(documents)
-        similarity_matrix = cosine_similarity_matrix(embeddings)
-
-        selected_indices: list[int] = []
-        for i in range(len(sorted_results)):
-            if all(similarity_matrix[i][j] <= similarity_threshold for j in selected_indices):
-                selected_indices.append(i)
-
-        return [sorted_results[i] for i in selected_indices]
-
     @timed
     def _sort_and_trim(
         self, results, top_k, plugin=False, search_tool_memory=False, tool_mem_top_k=6
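
As a toy comparison of the two strategies this commit consolidates: the retained exact-text dedup only merges identical memory strings, while the removed similarity-based variant would also drop near-paraphrases whose embedding similarity exceeds the 0.85 threshold. The items and scores below are invented for illustration.

# Made-up data; MemoryItem is a hypothetical stand-in for TextualMemoryItem.
from dataclasses import dataclass

@dataclass
class MemoryItem:
    memory: str

results = [
    (MemoryItem("User likes green tea"), 0.92),
    (MemoryItem("User likes green tea"), 0.81),       # exact duplicate: merged either way
    (MemoryItem("The user enjoys green tea"), 0.78),  # paraphrase: only the removed semantic dedup would drop it
]

deduped = {}
for item, score in results:
    if item.memory not in deduped or score > deduped[item.memory][1]:
        deduped[item.memory] = (item, score)
print([(i.memory, s) for i, s in deduped.values()])
# [('User likes green tea', 0.92), ('The user enjoys green tea', 0.78)]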