@@ -40,17 +40,25 @@ def _calc_no_answer(
         # the most significant difference between scores.
         # Most significant difference: a model switching from predicting an answer to "no answer" (or vice versa).
         # No_ans_gap is a list of this most significant difference per document
-        no_ans_gap_array = np.array(no_ans_gaps)
-        max_no_ans_gap = np.max(no_ans_gap_array)
-        # case 1: all passages "no answer" as top score
-        # max_no_ans_gap is negative, so it increases best pos score
-        # case 2: at least one passage predicts an answer (positive no_ans_gap)
-        no_ans_score = best_score_answer - max_no_ans_gap
+
+        # If there is not even one predicted answer, we return a no_answer with score 1.0
+        if best_score_answer == 0 and len(no_ans_gaps) == 0:
+            no_ans_score = 1024.0
+            no_ans_score_scaled = 1.0
+            max_no_ans_gap = 1024.0
+        else:
+            no_ans_gap_array = np.array(no_ans_gaps)
+            max_no_ans_gap = np.max(no_ans_gap_array)
+            # case 1: all passages "no answer" as top score
+            # max_no_ans_gap is negative, so it increases best pos score
+            # case 2: at least one passage predicts an answer (positive no_ans_gap)
+            no_ans_score = best_score_answer - max_no_ans_gap
+            no_ans_score_scaled = float(expit(np.asarray(no_ans_score) / 8))

         no_ans_prediction = Answer(
             answer="",
             type="extractive",
-            score=float(expit(np.asarray(no_ans_score) / 8))
+            score=no_ans_score_scaled
             if use_confidence_scores
             else no_ans_score,  # just a pseudo prob for now or old score,
             context=None,
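
The new branch above only changes the degenerate case: when no document produced any prediction, the raw no-answer score falls back to 1024.0 and the scaled confidence to 1.0; otherwise the raw score is still `best_score_answer - max_no_ans_gap`, squashed into (0, 1) by `expit(score / 8)`. A minimal sketch of that arithmetic, assuming numpy and scipy; the helper name `sketch_no_answer_score` is illustrative and not part of the codebase:

```python
# Minimal sketch of the no-answer scoring branch above; not the full _calc_no_answer.
import numpy as np
from scipy.special import expit

def sketch_no_answer_score(best_score_answer: float, no_ans_gaps: list):
    if best_score_answer == 0 and len(no_ans_gaps) == 0:
        # No passage produced any prediction: fixed pseudo score, maximal confidence.
        return 1024.0, 1.0
    max_no_ans_gap = np.max(np.array(no_ans_gaps))
    no_ans_score = best_score_answer - max_no_ans_gap
    # Squash the raw score into (0, 1); the divisor 8 acts as a temperature.
    return no_ans_score, float(expit(np.asarray(no_ans_score) / 8))

print(sketch_no_answer_score(0, []))             # (1024.0, 1.0)
print(sketch_no_answer_score(7.5, [-2.0, 3.0]))  # (4.5, ~0.64)
```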
@@ -80,10 +88,27 @@ def add_doc_meta_data_to_answer(documents: List[Document], answer):
     def run(self, query: str, documents: List[Document], top_k: Optional[int] = None, labels: Optional[MultiLabel] = None, add_isolated_node_eval: bool = False):  # type: ignore
         self.query_count += 1
         predict = self.timing(self.predict, "query_time")
+        # Remove empty text documents before making predictions
+        documents = [d for d in documents if not isinstance(d.content, str) or d.content.strip() != ""]
         if documents:
             results = predict(query=query, documents=documents, top_k=top_k)
         else:
-            results = {"answers": []}
+            if hasattr(self, "return_no_answers") and self.return_no_answers:
+                no_ans_prediction = Answer(
+                    answer="",
+                    type="extractive",
+                    score=1.0
+                    if hasattr(self, "use_confidence_scores") and self.use_confidence_scores
+                    else 1024.0,  # just a pseudo prob for now or old score,
+                    context=None,
+                    offsets_in_context=[Span(start=0, end=0)],
+                    offsets_in_document=[Span(start=0, end=0)],
+                    document_id=None,
+                    meta=None,
+                )
+                results = {"answers": [no_ans_prediction]}
+            else:
+                results = {"answers": []}

         # Add corresponding document_name and more meta data, if an answer contains the document_id
         results["answers"] = [
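
The net effect in `run()`: if no non-empty document reaches the reader and it was configured with `return_no_answers=True`, the node now returns a single empty-string no-answer instead of an empty list. A rough usage sketch, assuming a FARMReader with the default `use_confidence_scores=True` (the model name is only illustrative):

```python
from haystack.nodes import FARMReader

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", return_no_answers=True)

# No (non-empty) documents: previously results["answers"] was [],
# now it holds one Answer(answer="", score=1.0, ...) placeholder.
results, _ = reader.run(query="Who is the father of Arya Stark?", documents=[])
print(results["answers"])
```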
@@ -92,7 +117,9 @@ def run(self, query: str, documents: List[Document], top_k: Optional[int] = None

         # run evaluation with labels as node inputs
         if add_isolated_node_eval and labels is not None:
-            relevant_documents = {label.document.id: label.document for label in labels.labels}.values()
+            relevant_documents = [label.document for label in labels.labels]
+            # Filter out empty documents
+            relevant_documents = [d for d in relevant_documents if d.content.strip() != ""]
             results_label_input = predict(query=query, documents=relevant_documents, top_k=top_k)

         # Add corresponding document_name and more meta data, if an answer contains the document_id
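
The filtering added in `run()` above keys off `Document.content`: whitespace-only and empty-string documents are dropped, while non-string content (e.g. table documents) passes through untouched. A small illustration, assuming Haystack's `Document` class from `haystack.schema`:

```python
from haystack.schema import Document

docs = [
    Document(content="Berlin is the capital of Germany."),
    Document(content="   "),  # whitespace only -> dropped
    Document(content=""),     # empty string -> dropped
]
docs = [d for d in docs if not isinstance(d.content, str) or d.content.strip() != ""]
print(len(docs))  # 1
```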
@@ -113,6 +140,14 @@ def run_batch( # type: ignore
         add_isolated_node_eval: bool = False,
     ):
         self.query_count += len(queries)
+
+        # Remove empty documents before making predictions
+        if len(documents) > 0:
+            if isinstance(documents[0], Document):
+                documents = [d for d in documents if not isinstance(d.content, str) or d.content.strip() != ""]  # type: ignore[union-attr, assignment]
+            else:
+                documents = [[d for d in docs_per_query if not isinstance(d.content, str) or d.content.strip() != ""] for docs_per_query in documents]  # type: ignore[union-attr]
+
         if not documents:
             return {"answers": []}, "output_1"

@@ -138,7 +173,11 @@ def run_batch( # type: ignore
         if add_isolated_node_eval and labels is not None:
             relevant_documents = []
             for labelx in labels:
-                relevant_documents.append([label.document for label in labelx.labels])
+                # Filter out empty documents
+                relevant_docs_labelx = [
+                    label.document for label in labelx.labels if label.document.content.strip() != ""
+                ]
+                relevant_documents.append(relevant_docs_labelx)
             results_label_input = predict_batch(queries=queries, documents=relevant_documents, top_k=top_k)

         # Add corresponding document_name and more meta data, if an answer contains the document_id
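
`run_batch()` accepts either a flat list of Documents (one pool shared across all queries) or one list per query, so the same emptiness filter has to handle both shapes, as the hunk above does. A hedged sketch of that branching; `filter_empty_documents` is a hypothetical helper mirroring the diff, not part of the reader API:

```python
from typing import List, Union
from haystack.schema import Document

def filter_empty_documents(
    documents: Union[List[Document], List[List[Document]]]
) -> Union[List[Document], List[List[Document]]]:
    # Hypothetical helper: same filtering logic as in run_batch() above.
    if len(documents) > 0 and isinstance(documents[0], Document):
        # Flat list: one pool of documents used for every query.
        return [d for d in documents if not isinstance(d.content, str) or d.content.strip() != ""]
    # Nested list: one document list per query; filter each inner list, keep the outer shape.
    return [
        [d for d in docs if not isinstance(d.content, str) or d.content.strip() != ""]
        for docs in documents
    ]
```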