@@ -227,9 +227,9 @@ def get_chat_context(history_pairs: List[Tuple[str, str, str]], summary: str) ->
 )
 
 judge_prompt = PromptTemplate(
-    input_variables=["query", "context_snippet"],
+    input_variables=["query", "context_snippets"],
     template="""
-Return JSON only: {{"topic_match":"strong|medium|absolutely_not_possible","sufficient":true/false,"why":"short","alternative":"<anchored question or empty>"}}
+Return JSON only: {{"judgments":[{{"index": <index of snippet>, "topic_match":"strong|medium|absolutely_not_possible","sufficient":true/false,"why":"short","alternative":"<anchored question or empty>"}}]}}
 
 Guidance:
 - strong: Large facts about the query can be found and more supporting facts about the topic so A strong answer can be formed about the query from the context.
@@ -239,11 +239,13 @@ def get_chat_context(history_pairs: List[Tuple[str, str, str]], summary: str) ->
 sufficient is True when topic_match is strong or weak with some similarity to query or topic
 Sufficient is False otherwise
 
+For each provided context snippet, make a judgment. The `index` in the output JSON should correspond to the order of the snippets in the input.
+
 Query:
 {query}
 
-Context (short excerpts):
-{context_snippet}
+Context Snippets:
+{context_snippets}
 """
 )
 
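For reference, a reply that follows the new batched schema parses into a plain dict with one judgment per snippet index. A minimal illustrative sketch (the values below are hypothetical, not actual model output):

# Hypothetical example of a well-formed judge reply for two snippets,
# shown after JSON parsing. Field values are illustrative only.
example_judgments = {
    "judgments": [
        {"index": 0, "topic_match": "strong", "sufficient": True,
         "why": "directly covers the query", "alternative": ""},
        {"index": 1, "topic_match": "absolutely_not_possible", "sufficient": False,
         "why": "unrelated topic", "alternative": "What does the document say about X?"},
    ]
}
# Each "index" maps back to the order of snippets in {context_snippets},
# so a judgment can be matched to candidates[index].
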
@@ -345,27 +347,53 @@ def judge_sufficiency(query: str, candidates: List[Dict[str, Any]], judge_llm: C
     topic_match_order = {"strong": 3, "medium": 2, "absolutely_not_possible": 1}
 
     logging.info(f"len of candidates {len(candidates)}")
-    for c in candidates:  # inspect up to 12. Iterate through all candidates initially
-        snippet = f"Source: {c['meta'].get('doc_name', 'unknown')}\nExcerpt: {c['text']}"
-        prompt = judge_prompt.format(query=query, context_snippet=snippet)
 
-        resp = judge_llm.invoke([HumanMessage(content=prompt)]).content
+    # Prepare snippets for batch judging
+    snippets_for_llm = []
+    for idx, c in enumerate(candidates):
+        snippet_text = f"Source: {c['meta'].get('doc_name', 'unknown')}\nExcerpt: {c['text']}"
+        snippets_for_llm.append(f"Snippet {idx}:\n{snippet_text}")
+
+    combined_snippets = "\n\n".join(snippets_for_llm)
+
+    prompt = judge_prompt.format(query=query, context_snippets=combined_snippets)
 
-        try:
-            obj = json.loads(resp[resp.rfind("{"):resp.rfind("}") + 1])
-            logging.info(obj)
-            topic_match_label = obj.get("topic_match", "absolutely_not_possible")
-            # Store topic_match_score in the chunk's meta for easier sorting
-            c['meta']['topic_match_score'] = topic_match_order.get(topic_match_label, 0)  # Default to 0 for unknown/error
-
-            if obj.get("sufficient", False):
-                qualified_with_scores.append(c)  # Add to qualified list
-            else:
-                followup_chunks_raw.append(c)
-        except Exception:
-            # Fallback based on cross-encoder score if LLM judge fails
-            # Assign a default topic_match_score (e.g., 'medium' equivalent if LLM fails to parse)
-            c['meta']['topic_match_score'] = topic_match_order.get("medium", 0)
+    try:
+        resp = judge_llm.invoke([HumanMessage(content=prompt)]).content
+        parsed_judgments = safe_json_parse(resp)
+        if parsed_judgments and "judgments" in parsed_judgments:
+            for judgment in parsed_judgments["judgments"]:
+                idx = judgment.get("index")
+                if idx is not None and 0 <= idx < len(candidates):
+                    c = candidates[idx]
+                    topic_match_label = judgment.get("topic_match", "absolutely_not_possible")
+                    c['meta']['topic_match_score'] = topic_match_order.get(topic_match_label, 0)
+                    if judgment.get("sufficient", False):
+                        qualified_with_scores.append(c)
+                    else:
+                        followup_chunks_raw.append(c)
+                else:
+                    logging.warning(f"[judge_sufficiency] Invalid index in LLM judgment: {judgment}")
+                    # Fallback for invalid index
+                    if c["scores"]["cross"] > threshold_weak:
+                        qualified_with_scores.append(c)
+                    else:
+                        followup_chunks_raw.append(c)
+        else:
+            logging.warning("[judge_sufficiency] LLM did not return valid batched judgments. Falling back to cross-encoder scores.")
+            # Fallback based on cross-encoder score if LLM fails to parse or returns no judgments
+            for c in candidates:
+                c['meta']['topic_match_score'] = topic_match_order.get("medium", 0)  # Assign a default topic_match_score
+                if c["scores"]["cross"] > threshold_weak:
+                    qualified_with_scores.append(c)
+                else:
+                    followup_chunks_raw.append(c)
+    except Exception as e:
+        logging.error(f"[judge_sufficiency] Error during batched LLM judging: {e}")
+        logging.exception("[judge_sufficiency] Full traceback for batched judging error:")
+        # Fallback based on cross-encoder score if LLM invocation fails
+        for c in candidates:
+            c['meta']['topic_match_score'] = topic_match_order.get("medium", 0)  # Assign a default topic_match_score
             if c["scores"]["cross"] > threshold_weak:
                 qualified_with_scores.append(c)
             else:
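The batched path above calls a `safe_json_parse` helper that is not part of this diff and is presumably defined elsewhere in the module. A rough sketch of the behavior the new code relies on, assuming the helper extracts the outermost JSON object from the model reply and returns None on failure rather than raising:

import json
import logging
from typing import Optional

def safe_json_parse(text: str) -> Optional[dict]:
    # Hypothetical sketch only; the real helper in this repo may differ.
    # Pull out the outermost {...} span and parse it, returning None when
    # the reply contains no valid JSON object.
    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end == -1 or end < start:
        logging.warning("[safe_json_parse] No JSON object found in response")
        return None
    try:
        return json.loads(text[start:end + 1])
    except json.JSONDecodeError as e:
        logging.warning(f"[safe_json_parse] Failed to parse JSON: {e}")
        return None

With this shape, `parsed_judgments` is either a dict containing a "judgments" list or None, which is what the `if parsed_judgments and "judgments" in parsed_judgments` check in the new code expects.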