Commit 659020f

fix: Convert table cells to strings for compatibility with TableReader (#3762)
* Add table = table.astype(str) to make sure cells are converted into strings to be compatible with the TableReader
* Turn more strings into ints in the test tables
* Make sure answer text is always a string.
1 parent 93b48bc commit 659020f
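The core of the change is a single cast: a table document's content is a pandas DataFrame whose cells may be ints or floats, and the reader now normalizes every cell to a string with DataFrame.astype(str) before prediction. A minimal standalone sketch of that cast (plain pandas, not Haystack code):

import pandas as pd

# A table like the ones the TableReader receives: one text column, one int column.
table = pd.DataFrame({"actors": ["brad pitt", "george clooney"], "age": [58, 60]})

# The fix in isolation: cast every cell to str so downstream answer handling always sees strings.
string_table = table.astype(str)

print(table.dtypes.tolist())         # [dtype('O'), dtype('int64')]
print(string_table.dtypes.tolist())  # [dtype('O'), dtype('O')] -- every cell is now a str
print(repr(string_table.iat[0, 1]))  # '58'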

File tree

haystack/nodes/reader/table.py
test/nodes/test_table_reader.py

2 files changed: +22 -18 lines changed
haystack/nodes/reader/table.py

Lines changed: 18 additions & 14 deletions
@@ -247,7 +247,8 @@ def __init__(
         self.device = device
 
     def _predict_tapas(self, inputs: BatchEncoding, document: Document) -> Answer:
-        table: pd.DataFrame = document.content
+        orig_table: pd.DataFrame = document.content
+        string_table = orig_table.astype(str)
 
         # Forward query and table through model and convert logits to predictions
         with torch.inference_mode():
@@ -270,7 +271,7 @@ def _predict_tapas(self, inputs: BatchEncoding, document: Document) -> Answer:
         current_answer_coordinates = predicted_answer_coordinates[0]
         current_answer_cells = []
         for coordinate in current_answer_coordinates:
-            current_answer_cells.append(table.iat[coordinate])
+            current_answer_cells.append(string_table.iat[coordinate])
 
         # Get aggregation operator
         if self.model.config.aggregation_labels is not None:
@@ -286,13 +287,13 @@ def _predict_tapas(self, inputs: BatchEncoding, document: Document) -> Answer:
         else:
             answer_str = self._aggregate_answers(current_aggregation_operator, current_answer_cells)
 
-        answer_offsets = _calculate_answer_offsets(current_answer_coordinates, document.content)
+        answer_offsets = _calculate_answer_offsets(current_answer_coordinates, string_table)
 
         answer = Answer(
             answer=answer_str,
             type="extractive",
             score=current_score,
-            context=document.content,
+            context=string_table,
             offsets_in_document=answer_offsets,
             offsets_in_context=answer_offsets,
             document_id=document.id,
@@ -373,6 +374,7 @@ def predict(self, query: str, documents: List[Document], top_k: int) -> Dict:
         table_documents = _check_documents(documents)
         for document in table_documents:
             table: pd.DataFrame = document.content
+            table = table.astype(str)
             model_inputs = self.tokenizer(
                 table=table, queries=query, max_length=self.max_seq_len, return_tensors="pt", truncation=True
             )
@@ -418,7 +420,8 @@ def __init__(
         self.return_no_answer = return_no_answer
 
     def _predict_tapas_scored(self, inputs: BatchEncoding, document: Document) -> Tuple[List[Answer], float]:
-        table: pd.DataFrame = document.content
+        orig_table: pd.DataFrame = document.content
+        string_table = orig_table.astype(str)
 
         # Forward pass through model
         with torch.inference_mode():
@@ -494,8 +497,8 @@ def _predict_tapas_scored(self, inputs: BatchEncoding, document: Document) -> Tuple[List[Answer], float]:
         answers = []
         for answer_span_idx in top_k_answer_spans.indices:
             current_answer_span = possible_answer_spans[answer_span_idx]
-            answer_str = table.iat[current_answer_span[:2]]
-            answer_offsets = _calculate_answer_offsets([current_answer_span[:2]], document.content)
+            answer_str = string_table.iat[current_answer_span[:2]]
+            answer_offsets = _calculate_answer_offsets([current_answer_span[:2]], string_table)
             # As the general table score is more important for the final score, it is double weighted.
             current_score = ((2 * table_relevancy_prob) + span_logits_softmax[0, answer_span_idx].item()) / 3
 
@@ -504,11 +507,11 @@ def _predict_tapas_scored(self, inputs: BatchEncoding, document: Document) -> Tuple[List[Answer], float]:
                     answer=answer_str,
                     type="extractive",
                     score=current_score,
-                    context=document.content,
+                    context=string_table,
                     offsets_in_document=answer_offsets,
                     offsets_in_context=answer_offsets,
                     document_id=document.id,
-                    meta={"aggregation_operator": "NONE", "answer_cells": table.iat[current_answer_span[:2]]},
+                    meta={"aggregation_operator": "NONE", "answer_cells": string_table.iat[current_answer_span[:2]]},
                 )
             )
 
@@ -520,6 +523,7 @@ def predict(self, query: str, documents: List[Document], top_k: int) -> Dict:
         table_documents = _check_documents(documents)
         for document in table_documents:
             table: pd.DataFrame = document.content
+            table = table.astype(str)
             model_inputs = self.tokenizer(
                 table=table, queries=query, max_length=self.max_seq_len, return_tensors="pt", truncation=True
             )
@@ -702,8 +706,8 @@ def predict(self, query: str, documents: List[Document], top_k: Optional[int] =
         for document in table_documents:
            # Create row and column representations
             table: pd.DataFrame = document.content
-            table = table.astype(str)
-            row_reps, column_reps = self._create_row_column_representations(table)
+            string_table = table.astype(str)
+            row_reps, column_reps = self._create_row_column_representations(string_table)
 
             # Get row logits
             row_inputs = self.row_tokenizer(
@@ -742,14 +746,14 @@ def predict(self, query: str, documents: List[Document], top_k: Optional[int] =
                    current_cell_score = float(row_score + col_score)
                    cell_scores_table[-1].append(current_cell_score)
 
-                   answer_str = table.iloc[row_idx, col_idx]
-                   answer_offsets = self._calculate_answer_offsets(row_idx, col_idx, table)
+                   answer_str = string_table.iloc[row_idx, col_idx]
+                   answer_offsets = self._calculate_answer_offsets(row_idx, col_idx, string_table)
                    current_answers.append(
                        Answer(
                            answer=answer_str,
                            type="extractive",
                            score=current_cell_score,
-                           context=table,
+                           context=string_table,
                            offsets_in_document=[answer_offsets],
                            offsets_in_context=[answer_offsets],
                            document_id=document.id,
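Every call site above switches from the raw table to string_table because the answer text is assembled from the selected cells, and that assembly (for example, joining several cells into one aggregated answer) assumes str cells. The snippet below is only an illustrative stand-in for that failure mode, not the library's actual _aggregate_answers implementation:

import pandas as pd

table = pd.DataFrame({"age": [58, 47, 60]})

cells = [table.iat[i, 0] for i in range(len(table))]
# ", ".join(cells)  # would raise TypeError: the cells are ints, not strings

string_table = table.astype(str)  # what the reader now does up front
string_cells = [string_table.iat[i, 0] for i in range(len(string_table))]
print(", ".join(string_cells))    # "58, 47, 60" -- safe once every cell is a str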

test/nodes/test_table_reader.py

Lines changed: 4 additions & 4 deletions
@@ -11,8 +11,8 @@
 def table1():
     data = {
         "actors": ["brad pitt", "leonardo di caprio", "george clooney"],
-        "age": ["58", "47", "60"],
-        "number of movies": ["87", "53", "69"],
+        "age": [58, 47, 60],
+        "number of movies": [87, 53, 69],
         "date of birth": ["18 december 1963", "11 november 1974", "6 may 1961"],
     }
     return pd.DataFrame(data)
@@ -22,8 +22,8 @@ def table1():
 def table2():
     data = {
         "actors": ["chris pratt", "gal gadot", "oprah winfrey"],
-        "age": ["45", "36", "65"],
-        "number of movies": ["49", "34", "5"],
+        "age": [45, 36, 65],
+        "number of movies": [49, 34, 5],
         "date of birth": ["12 january 1975", "5 april 1980", "15 september 1960"],
     }
     return pd.DataFrame(data)
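With "age" and "number of movies" now stored as plain ints, the fixtures exercise the reader's internal astype(str) path end to end. A hedged usage sketch, assuming Haystack v1.x with the TableReader dependencies installed; the model name is the reader's usual default and the query is illustrative:

import pandas as pd
from haystack import Document
from haystack.nodes import TableReader

# Same shape as the updated table1() fixture: numeric cells stay ints here,
# because the reader converts them to strings internally (this commit).
table = pd.DataFrame(
    {
        "actors": ["brad pitt", "leonardo di caprio", "george clooney"],
        "age": [58, 47, 60],
    }
)
document = Document(content=table, content_type="table")

reader = TableReader(model_name_or_path="google/tapas-base-finetuned-wtq")
prediction = reader.predict(query="How old is Brad Pitt?", documents=[document], top_k=1)
print(prediction["answers"][0].answer)  # e.g. "58", returned as a string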
