Commit 659020f

fix: Convert table cells to strings for compatibility with TableReader (#3762)
* Add table = table.astype(str) to make sure cells are converted into strings to be compatible with the TableReader
* Turn more strings into ints in the test tables
* Make sure answer text is always a string.
1 parent 93b48bc commit 659020f
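The core of the change is a single cast: a table document's content is a pandas DataFrame whose cells may be ints or floats, and the reader now normalizes every cell to a string with DataFrame.astype(str) before prediction. A minimal standalone sketch of that cast (plain pandas, not Haystack code):

import pandas as pd

# A table like the ones the TableReader receives: one text column, one int column.
table = pd.DataFrame({"actors": ["brad pitt", "george clooney"], "age": [58, 60]})

# The fix in isolation: cast every cell to str so downstream answer handling always sees strings.
string_table = table.astype(str)

print(table.dtypes.tolist())         # [dtype('O'), dtype('int64')]
print(string_table.dtypes.tolist())  # [dtype('O'), dtype('O')] -- every cell is now a str
print(repr(string_table.iat[0, 1]))  # '58'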

File tree

haystack/nodes/reader/table.py
test/nodes/test_table_reader.py

2 files changed: +22 -18 lines changed
haystack/nodes/reader/table.py

Lines changed: 18 additions & 14 deletions
@@ -247,7 +247,8 @@ def __init__(
         self.device = device
 
     def _predict_tapas(self, inputs: BatchEncoding, document: Document) -> Answer:
-        table: pd.DataFrame = document.content
+        orig_table: pd.DataFrame = document.content
+        string_table = orig_table.astype(str)
 
         # Forward query and table through model and convert logits to predictions
         with torch.inference_mode():
@@ -270,7 +271,7 @@ def _predict_tapas(self, inputs: BatchEncoding, document: Document) -> Answer:
         current_answer_coordinates = predicted_answer_coordinates[0]
         current_answer_cells = []
         for coordinate in current_answer_coordinates:
-            current_answer_cells.append(table.iat[coordinate])
+            current_answer_cells.append(string_table.iat[coordinate])
 
         # Get aggregation operator
         if self.model.config.aggregation_labels is not None:
@@ -286,13 +287,13 @@ def _predict_tapas(self, inputs: BatchEncoding, document: Document) -> Answer:
         else:
             answer_str = self._aggregate_answers(current_aggregation_operator, current_answer_cells)
 
-        answer_offsets = _calculate_answer_offsets(current_answer_coordinates, document.content)
+        answer_offsets = _calculate_answer_offsets(current_answer_coordinates, string_table)
 
         answer = Answer(
             answer=answer_str,
             type="extractive",
             score=current_score,
-            context=document.content,
+            context=string_table,
             offsets_in_document=answer_offsets,
             offsets_in_context=answer_offsets,
             document_id=document.id,
@@ -373,6 +374,7 @@ def predict(self, query: str, documents: List[Document], top_k: int) -> Dict:
         table_documents = _check_documents(documents)
         for document in table_documents:
             table: pd.DataFrame = document.content
+            table = table.astype(str)
             model_inputs = self.tokenizer(
                 table=table, queries=query, max_length=self.max_seq_len, return_tensors="pt", truncation=True
             )
@@ -418,7 +420,8 @@ def __init__(
         self.return_no_answer = return_no_answer
 
     def _predict_tapas_scored(self, inputs: BatchEncoding, document: Document) -> Tuple[List[Answer], float]:
-        table: pd.DataFrame = document.content
+        orig_table: pd.DataFrame = document.content
+        string_table = orig_table.astype(str)
 
         # Forward pass through model
         with torch.inference_mode():
@@ -494,8 +497,8 @@ def _predict_tapas_scored(self, inputs: BatchEncoding, document: Document) -> Tuple[List[Answer], float]:
         answers = []
         for answer_span_idx in top_k_answer_spans.indices:
             current_answer_span = possible_answer_spans[answer_span_idx]
-            answer_str = table.iat[current_answer_span[:2]]
-            answer_offsets = _calculate_answer_offsets([current_answer_span[:2]], document.content)
+            answer_str = string_table.iat[current_answer_span[:2]]
+            answer_offsets = _calculate_answer_offsets([current_answer_span[:2]], string_table)
             # As the general table score is more important for the final score, it is double weighted.
             current_score = ((2 * table_relevancy_prob) + span_logits_softmax[0, answer_span_idx].item()) / 3
 
@@ -504,11 +507,11 @@ def _predict_tapas_scored(self, inputs: BatchEncoding, document: Document) -> Tuple[List[Answer], float]:
                     answer=answer_str,
                     type="extractive",
                     score=current_score,
-                    context=document.content,
+                    context=string_table,
                     offsets_in_document=answer_offsets,
                     offsets_in_context=answer_offsets,
                     document_id=document.id,
-                    meta={"aggregation_operator": "NONE", "answer_cells": table.iat[current_answer_span[:2]]},
+                    meta={"aggregation_operator": "NONE", "answer_cells": string_table.iat[current_answer_span[:2]]},
                 )
             )
 
@@ -520,6 +523,7 @@ def predict(self, query: str, documents: List[Document], top_k: int) -> Dict:
         table_documents = _check_documents(documents)
         for document in table_documents:
             table: pd.DataFrame = document.content
+            table = table.astype(str)
             model_inputs = self.tokenizer(
                 table=table, queries=query, max_length=self.max_seq_len, return_tensors="pt", truncation=True
             )
@@ -702,8 +706,8 @@ def predict(self, query: str, documents: List[Document], top_k: Optional[int] =
         for document in table_documents:
            # Create row and column representations
             table: pd.DataFrame = document.content
-            table = table.astype(str)
-            row_reps, column_reps = self._create_row_column_representations(table)
+            string_table = table.astype(str)
+            row_reps, column_reps = self._create_row_column_representations(string_table)
 
             # Get row logits
             row_inputs = self.row_tokenizer(
@@ -742,14 +746,14 @@ def predict(self, query: str, documents: List[Document], top_k: Optional[int] =
                    current_cell_score = float(row_score + col_score)
                    cell_scores_table[-1].append(current_cell_score)
 
-                   answer_str = table.iloc[row_idx, col_idx]
-                   answer_offsets = self._calculate_answer_offsets(row_idx, col_idx, table)
+                   answer_str = string_table.iloc[row_idx, col_idx]
+                   answer_offsets = self._calculate_answer_offsets(row_idx, col_idx, string_table)
                    current_answers.append(
                        Answer(
                            answer=answer_str,
                            type="extractive",
                            score=current_cell_score,
-                           context=table,
+                           context=string_table,
                            offsets_in_document=[answer_offsets],
                            offsets_in_context=[answer_offsets],
                            document_id=document.id,
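Every call site above switches from the raw table to string_table because the answer text is assembled from the selected cells, and that assembly (for example, joining several cells into one aggregated answer) assumes str cells. The snippet below is only an illustrative stand-in for that failure mode, not the library's actual _aggregate_answers implementation:

import pandas as pd

table = pd.DataFrame({"age": [58, 47, 60]})

cells = [table.iat[i, 0] for i in range(len(table))]
# ", ".join(cells)  # would raise TypeError: the cells are ints, not strings

string_table = table.astype(str)  # what the reader now does up front
string_cells = [string_table.iat[i, 0] for i in range(len(string_table))]
print(", ".join(string_cells))    # "58, 47, 60" -- safe once every cell is a str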

test/nodes/test_table_reader.py

Lines changed: 4 additions & 4 deletions
@@ -11,8 +11,8 @@
 def table1():
     data = {
         "actors": ["brad pitt", "leonardo di caprio", "george clooney"],
-        "age": ["58", "47", "60"],
-        "number of movies": ["87", "53", "69"],
+        "age": [58, 47, 60],
+        "number of movies": [87, 53, 69],
         "date of birth": ["18 december 1963", "11 november 1974", "6 may 1961"],
     }
     return pd.DataFrame(data)
@@ -22,8 +22,8 @@ def table1():
 def table2():
     data = {
         "actors": ["chris pratt", "gal gadot", "oprah winfrey"],
-        "age": ["45", "36", "65"],
-        "number of movies": ["49", "34", "5"],
+        "age": [45, 36, 65],
+        "number of movies": [49, 34, 5],
         "date of birth": ["12 january 1975", "5 april 1980", "15 september 1960"],
     }
     return pd.DataFrame(data)
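With "age" and "number of movies" now stored as plain ints, the fixtures exercise the reader's internal astype(str) path end to end. A hedged usage sketch, assuming Haystack v1.x with the TableReader dependencies installed; the model name is the reader's usual default and the query is illustrative:

import pandas as pd
from haystack import Document
from haystack.nodes import TableReader

# Same shape as the updated table1() fixture: numeric cells stay ints here,
# because the reader converts them to strings internally (this commit).
table = pd.DataFrame(
    {
        "actors": ["brad pitt", "leonardo di caprio", "george clooney"],
        "age": [58, 47, 60],
    }
)
document = Document(content=table, content_type="table")

reader = TableReader(model_name_or_path="google/tapas-base-finetuned-wtq")
prediction = reader.predict(query="How old is Brad Pitt?", documents=[document], top_k=1)
print(prediction["answers"][0].answer)  # e.g. "58", returned as a string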
