
Commit 8fbccbd

fix: handle Documents containing dataframes in Multilabel constructor (#3237)
* format
* fix docs
1 parent 19af6f4 commit 8fbccbd

File tree

3 files changed, +33 -19 lines changed:

- docs/_src/api/api/primitives.md
- haystack/schema.py
- test/others/test_schema.py

3 files changed

+33
-19
lines changed

docs/_src/api/api/primitives.md

Lines changed: 7 additions & 7 deletions

(The changes in this file are whitespace-only formatting; each removed/added line pair is otherwise identical.)

@@ -144,9 +144,9 @@ class Span()
 
 #### end
 
-Defining a sequence of characters (Text span) or cells (Table span) via start and end index.
+Defining a sequence of characters (Text span) or cells (Table span) via start and end index.
 
-For extractive QA: Character where answer starts/ends
+For extractive QA: Character where answer starts/ends
 For TableQA: Cell where the answer starts/ends (counted from top left to bottom right of table)
 
 **Arguments**:

@@ -174,24 +174,24 @@ For example, it's used within some Nodes like the Reader, but also in the REST A
 **Arguments**:
 
 - `answer`: The answer string. If there's no possible answer (aka "no_answer" or "is_impossible) this will be an empty string.
-- `type`: One of ("generative", "extractive", "other"): Whether this answer comes from an extractive model
-(i.e. we can locate an exact answer string in one of the documents) or from a generative model
+- `type`: One of ("generative", "extractive", "other"): Whether this answer comes from an extractive model
+(i.e. we can locate an exact answer string in one of the documents) or from a generative model
 (i.e. no pointer to a specific document, no offsets ...).
 - `score`: The relevance score of the Answer determined by a model (e.g. Reader or Generator).
 In the range of [0,1], where 1 means extremely relevant.
 - `context`: The related content that was used to create the answer (i.e. a text passage, part of a table, image ...)
 - `offsets_in_document`: List of `Span` objects with start and end positions of the answer **in the
 document** (as stored in the document store).
-For extractive QA: Character where answer starts => `Answer.offsets_in_document[0].start
+For extractive QA: Character where answer starts => `Answer.offsets_in_document[0].start
 For TableQA: Cell where the answer starts (counted from top left to bottom right of table) => `Answer.offsets_in_document[0].start
 (Note that in TableQA there can be multiple cell ranges that are relevant for the answer, thus there can be multiple `Spans` here)
 - `offsets_in_context`: List of `Span` objects with start and end positions of the answer **in the
 context** (i.e. the surrounding text/table of a certain window size).
-For extractive QA: Character where answer starts => `Answer.offsets_in_document[0].start
+For extractive QA: Character where answer starts => `Answer.offsets_in_document[0].start
 For TableQA: Cell where the answer starts (counted from top left to bottom right of table) => `Answer.offsets_in_document[0].start
 (Note that in TableQA there can be multiple cell ranges that are relevant for the answer, thus there can be multiple `Spans` here)
 - `document_id`: ID of the document that the answer was located it (if any)
-- `meta`: Dict that can be used to associate any kind of custom meta data with the answer.
+- `meta`: Dict that can be used to associate any kind of custom meta data with the answer.
 In extractive QA, this will carry the meta data of the document where the answer was found.
 
 <a id="schema.Answer.__lt__"></a>

haystack/schema.py

Lines changed: 12 additions & 12 deletions

(The docstring changes below are likewise whitespace-only; the functional change is the `str(...)` cast in the `MultiLabel` constructor in the last hunk.)

@@ -293,10 +293,10 @@ class Span:
     start: int
     end: int
     """
-    Defining a sequence of characters (Text span) or cells (Table span) via start and end index.
-    For extractive QA: Character where answer starts/ends
+    Defining a sequence of characters (Text span) or cells (Table span) via start and end index.
+    For extractive QA: Character where answer starts/ends
     For TableQA: Cell where the answer starts/ends (counted from top left to bottom right of table)
-
+
     :param start: Position where the span starts
     :param end: Position where the spand ends
     """

@@ -318,24 +318,24 @@ class Answer:
     For example, it's used within some Nodes like the Reader, but also in the REST API.
 
     :param answer: The answer string. If there's no possible answer (aka "no_answer" or "is_impossible) this will be an empty string.
-    :param type: One of ("generative", "extractive", "other"): Whether this answer comes from an extractive model
-    (i.e. we can locate an exact answer string in one of the documents) or from a generative model
-    (i.e. no pointer to a specific document, no offsets ...).
+    :param type: One of ("generative", "extractive", "other"): Whether this answer comes from an extractive model
+    (i.e. we can locate an exact answer string in one of the documents) or from a generative model
+    (i.e. no pointer to a specific document, no offsets ...).
     :param score: The relevance score of the Answer determined by a model (e.g. Reader or Generator).
     In the range of [0,1], where 1 means extremely relevant.
     :param context: The related content that was used to create the answer (i.e. a text passage, part of a table, image ...)
     :param offsets_in_document: List of `Span` objects with start and end positions of the answer **in the
     document** (as stored in the document store).
-    For extractive QA: Character where answer starts => `Answer.offsets_in_document[0].start
+    For extractive QA: Character where answer starts => `Answer.offsets_in_document[0].start
     For TableQA: Cell where the answer starts (counted from top left to bottom right of table) => `Answer.offsets_in_document[0].start
-    (Note that in TableQA there can be multiple cell ranges that are relevant for the answer, thus there can be multiple `Spans` here)
+    (Note that in TableQA there can be multiple cell ranges that are relevant for the answer, thus there can be multiple `Spans` here)
     :param offsets_in_context: List of `Span` objects with start and end positions of the answer **in the
     context** (i.e. the surrounding text/table of a certain window size).
-    For extractive QA: Character where answer starts => `Answer.offsets_in_document[0].start
+    For extractive QA: Character where answer starts => `Answer.offsets_in_document[0].start
     For TableQA: Cell where the answer starts (counted from top left to bottom right of table) => `Answer.offsets_in_document[0].start
-    (Note that in TableQA there can be multiple cell ranges that are relevant for the answer, thus there can be multiple `Spans` here)
+    (Note that in TableQA there can be multiple cell ranges that are relevant for the answer, thus there can be multiple `Spans` here)
     :param document_id: ID of the document that the answer was located it (if any)
-    :param meta: Dict that can be used to associate any kind of custom meta data with the answer.
+    :param meta: Dict that can be used to associate any kind of custom meta data with the answer.
     In extractive QA, this will carry the meta data of the document where the answer was found.
     """
 

@@ -679,7 +679,7 @@ def __init__(self, labels: List[Label], drop_negative_labels=False, drop_no_answ
         # Hence, we exclude them here as well.
 
         self.document_ids = [l.document.id for l in self.labels if not l.no_answer]
-        self.contexts = [l.document.content for l in self.labels if not l.no_answer]
+        self.contexts = [str(l.document.content) for l in self.labels if not l.no_answer]
 
     def _aggregate_labels(self, key, must_be_single_value=True) -> List[Any]:
         if any(isinstance(getattr(l, key), dict) for l in self.labels):
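The only functional change in this commit is the `str(...)` cast above: when a table `Document` stores its `content` as a pandas DataFrame, `MultiLabel.contexts` previously ended up holding the DataFrame object itself, whereas `contexts` is expected to be a list of plain strings (as the new test below asserts). A minimal sketch of the behaviour the fix relies on (assumes pandas is installed; variable names are illustrative):

import pandas as pd

# A table Document keeps its content as a DataFrame rather than a string.
table_content = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})

# str() renders the table as plain text; this is what MultiLabel.contexts
# now stores instead of the raw DataFrame object.
context = str(table_content)

print(type(context).__name__)  # str
print(context)
#    col1  col2
# 0     1     3
# 1     2     4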

test/others/test_schema.py

Lines changed: 14 additions & 0 deletions

@@ -1,6 +1,7 @@
 from haystack.schema import Document, Label, Answer, Span, MultiLabel, SpeechDocument, SpeechAnswer
 import pytest
 import numpy as np
+import pandas as pd
 
 from ..conftest import SAMPLES_PATH
 

@@ -401,6 +402,19 @@ def test_multilabel_id():
     assert MultiLabel(labels=[label3]).id == "531445fa3bdf98b8598a3bea032bd605"
 
 
+def test_multilabel_with_doc_containing_dataframes():
+    label = Label(
+        query="A question",
+        document=Document(content=pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})),
+        is_correct_answer=True,
+        is_correct_document=True,
+        origin="gold-label",
+        answer=Answer(answer="answer 1"),
+    )
+    assert len(MultiLabel(labels=[label]).contexts) == 1
+    assert type(MultiLabel(labels=[label]).contexts[0]) is str
+
+
 def test_serialize_speech_document():
     speech_doc = SpeechDocument(
         id=12345,
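To verify the fix locally, the new test can be run on its own, for example with `pytest test/others/test_schema.py -k multilabel_with_doc_containing_dataframes` (this assumes a development install of Haystack with pandas and the test suite's `conftest` available).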
