@@ -247,7 +247,8 @@ def __init__(
         self.device = device
 
     def _predict_tapas(self, inputs: BatchEncoding, document: Document) -> Answer:
-        table: pd.DataFrame = document.content
+        orig_table: pd.DataFrame = document.content
+        string_table = orig_table.astype(str)
 
         # Forward query and table through model and convert logits to predictions
         with torch.inference_mode():
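Note on the pattern introduced above: `DataFrame.astype(str)` returns a copy in which every cell value is a string, which is the form the downstream tokenizer and cell lookups expect, while `orig_table` keeps the original dtypes. A minimal sketch of the behaviour (the column names and values here are made up for illustration):

```python
import pandas as pd

# Hypothetical table with mixed dtypes
orig_table = pd.DataFrame({"city": ["Berlin", "Paris"], "population": [3_645_000, 2_161_000]})

# astype(str) returns a copy; orig_table keeps its numeric dtypes
string_table = orig_table.astype(str)
assert string_table.iat[0, 1] == "3645000"
assert orig_table["population"].dtype == "int64"
```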
@@ -270,7 +271,7 @@ def _predict_tapas(self, inputs: BatchEncoding, document: Document) -> Answer:
         current_answer_coordinates = predicted_answer_coordinates[0]
         current_answer_cells = []
         for coordinate in current_answer_coordinates:
-            current_answer_cells.append(table.iat[coordinate])
+            current_answer_cells.append(string_table.iat[coordinate])
 
         # Get aggregation operator
         if self.model.config.aggregation_labels is not None:
@@ -286,13 +287,13 @@ def _predict_tapas(self, inputs: BatchEncoding, document: Document) -> Answer:
         else:
             answer_str = self._aggregate_answers(current_aggregation_operator, current_answer_cells)
 
-        answer_offsets = _calculate_answer_offsets(current_answer_coordinates, document.content)
+        answer_offsets = _calculate_answer_offsets(current_answer_coordinates, string_table)
 
         answer = Answer(
             answer=answer_str,
             type="extractive",
             score=current_score,
-            context=document.content,
+            context=string_table,
             offsets_in_document=answer_offsets,
             offsets_in_context=answer_offsets,
             document_id=document.id,
@@ -373,6 +374,7 @@ def predict(self, query: str, documents: List[Document], top_k: int) -> Dict:
         table_documents = _check_documents(documents)
         for document in table_documents:
             table: pd.DataFrame = document.content
+            table = table.astype(str)
             model_inputs = self.tokenizer(
                 table=table, queries=query, max_length=self.max_seq_len, return_tensors="pt", truncation=True
             )
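The added `astype(str)` line matches a documented requirement of the `transformers` TAPAS tokenizer: all cell values in the `table` argument must be of type `str`. A sketch of the call shape this loop sets up (the checkpoint, table contents, and query are placeholders):

```python
import pandas as pd
from transformers import TapasTokenizer

tokenizer = TapasTokenizer.from_pretrained("google/tapas-base-finetuned-wtq")
table = pd.DataFrame({"year": [2019, 2020], "revenue": [1.5, 2.0]}).astype(str)

# Without astype(str), the tokenizer rejects non-string cell values
inputs = tokenizer(
    table=table, queries="Which year had the higher revenue?", return_tensors="pt", truncation=True
)
```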
@@ -418,7 +420,8 @@ def __init__(
         self.return_no_answer = return_no_answer
 
     def _predict_tapas_scored(self, inputs: BatchEncoding, document: Document) -> Tuple[List[Answer], float]:
-        table: pd.DataFrame = document.content
+        orig_table: pd.DataFrame = document.content
+        string_table = orig_table.astype(str)
 
         # Forward pass through model
         with torch.inference_mode():
@@ -494,8 +497,8 @@ def _predict_tapas_scored(self, inputs: BatchEncoding, document: Document) -> Tu
         answers = []
         for answer_span_idx in top_k_answer_spans.indices:
             current_answer_span = possible_answer_spans[answer_span_idx]
-            answer_str = table.iat[current_answer_span[:2]]
-            answer_offsets = _calculate_answer_offsets([current_answer_span[:2]], document.content)
+            answer_str = string_table.iat[current_answer_span[:2]]
+            answer_offsets = _calculate_answer_offsets([current_answer_span[:2]], string_table)
             # As the general table score is more important for the final score, it is double weighted.
             current_score = ((2 * table_relevancy_prob) + span_logits_softmax[0, answer_span_idx].item()) / 3
 
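As the comment in this hunk states, the final span score is a weighted mean that counts the table relevancy probability twice: `score = (2 * p_table + p_span) / 3`. A quick worked example with made-up probabilities:

```python
table_relevancy_prob = 0.9  # hypothetical: how relevant the whole table is
span_prob = 0.6             # hypothetical: softmaxed logit of this answer span

current_score = ((2 * table_relevancy_prob) + span_prob) / 3
assert abs(current_score - 0.8) < 1e-9
```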
@@ -504,11 +507,11 @@ def _predict_tapas_scored(self, inputs: BatchEncoding, document: Document) -> Tu
                     answer=answer_str,
                     type="extractive",
                     score=current_score,
-                    context=document.content,
+                    context=string_table,
                     offsets_in_document=answer_offsets,
                     offsets_in_context=answer_offsets,
                     document_id=document.id,
-                    meta={"aggregation_operator": "NONE", "answer_cells": table.iat[current_answer_span[:2]]},
+                    meta={"aggregation_operator": "NONE", "answer_cells": string_table.iat[current_answer_span[:2]]},
                 )
             )
 
@@ -520,6 +523,7 @@ def predict(self, query: str, documents: List[Document], top_k: int) -> Dict:
         table_documents = _check_documents(documents)
         for document in table_documents:
             table: pd.DataFrame = document.content
+            table = table.astype(str)
             model_inputs = self.tokenizer(
                 table=table, queries=query, max_length=self.max_seq_len, return_tensors="pt", truncation=True
             )
@@ -702,8 +706,8 @@ def predict(self, query: str, documents: List[Document], top_k: Optional[int] =
         for document in table_documents:
             # Create row and column representations
             table: pd.DataFrame = document.content
-            table = table.astype(str)
-            row_reps, column_reps = self._create_row_column_representations(table)
+            string_table = table.astype(str)
+            row_reps, column_reps = self._create_row_column_representations(string_table)
 
             # Get row logits
             row_inputs = self.row_tokenizer(
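This last reader scores every cell as the sum of a row score and a column score, which is why the table is linearized into one text per row and one per column before tokenization. The repository's `_create_row_column_representations` is not shown in this diff; the helper below is only a hypothetical illustration of such a linearization over the string-converted table:

```python
from typing import List, Tuple
import pandas as pd

def linearize_table(table: pd.DataFrame) -> Tuple[List[str], List[str]]:
    """Hypothetical sketch: one text per row and one per column."""
    rows = [
        " ; ".join(f"{col} : {table.iat[i, j]}" for j, col in enumerate(table.columns))
        for i in range(len(table))
    ]
    columns = [f"{col} : " + " ; ".join(table[col]) for col in table.columns]
    return rows, columns
```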
@@ -742,14 +746,14 @@ def predict(self, query: str, documents: List[Document], top_k: Optional[int] =
                     current_cell_score = float(row_score + col_score)
                     cell_scores_table[-1].append(current_cell_score)
 
-                    answer_str = table.iloc[row_idx, col_idx]
-                    answer_offsets = self._calculate_answer_offsets(row_idx, col_idx, table)
+                    answer_str = string_table.iloc[row_idx, col_idx]
+                    answer_offsets = self._calculate_answer_offsets(row_idx, col_idx, string_table)
                     current_answers.append(
                         Answer(
                             answer=answer_str,
                             type="extractive",
                             score=current_cell_score,
-                            context=table,
+                            context=string_table,
                             offsets_in_document=[answer_offsets],
                             offsets_in_context=[answer_offsets],
                             document_id=document.id,