
Commit 8fbccbd

fix: handle Documents containing dataframes in Multilabel constructor (#3237)
* format
* fix docs
1 parent 19af6f4 commit 8fbccbd

File tree

3 files changed, +33 -19 lines changed:

- docs/_src/api/api/primitives.md
- haystack/schema.py
- test/others/test_schema.py

3 files changed

+33
-19
lines changed

docs/_src/api/api/primitives.md

Lines changed: 7 additions & 7 deletions

(The changes in this file are whitespace-only formatting; each removed/added line pair is otherwise identical.)

@@ -144,9 +144,9 @@ class Span()
 
 #### end
 
-Defining a sequence of characters (Text span) or cells (Table span) via start and end index.
+Defining a sequence of characters (Text span) or cells (Table span) via start and end index.
 
-For extractive QA: Character where answer starts/ends
+For extractive QA: Character where answer starts/ends
 For TableQA: Cell where the answer starts/ends (counted from top left to bottom right of table)
 
 **Arguments**:

@@ -174,24 +174,24 @@ For example, it's used within some Nodes like the Reader, but also in the REST A
 **Arguments**:
 
 - `answer`: The answer string. If there's no possible answer (aka "no_answer" or "is_impossible) this will be an empty string.
-- `type`: One of ("generative", "extractive", "other"): Whether this answer comes from an extractive model
-(i.e. we can locate an exact answer string in one of the documents) or from a generative model
+- `type`: One of ("generative", "extractive", "other"): Whether this answer comes from an extractive model
+(i.e. we can locate an exact answer string in one of the documents) or from a generative model
 (i.e. no pointer to a specific document, no offsets ...).
 - `score`: The relevance score of the Answer determined by a model (e.g. Reader or Generator).
 In the range of [0,1], where 1 means extremely relevant.
 - `context`: The related content that was used to create the answer (i.e. a text passage, part of a table, image ...)
 - `offsets_in_document`: List of `Span` objects with start and end positions of the answer **in the
 document** (as stored in the document store).
-For extractive QA: Character where answer starts => `Answer.offsets_in_document[0].start
+For extractive QA: Character where answer starts => `Answer.offsets_in_document[0].start
 For TableQA: Cell where the answer starts (counted from top left to bottom right of table) => `Answer.offsets_in_document[0].start
 (Note that in TableQA there can be multiple cell ranges that are relevant for the answer, thus there can be multiple `Spans` here)
 - `offsets_in_context`: List of `Span` objects with start and end positions of the answer **in the
 context** (i.e. the surrounding text/table of a certain window size).
-For extractive QA: Character where answer starts => `Answer.offsets_in_document[0].start
+For extractive QA: Character where answer starts => `Answer.offsets_in_document[0].start
 For TableQA: Cell where the answer starts (counted from top left to bottom right of table) => `Answer.offsets_in_document[0].start
 (Note that in TableQA there can be multiple cell ranges that are relevant for the answer, thus there can be multiple `Spans` here)
 - `document_id`: ID of the document that the answer was located it (if any)
-- `meta`: Dict that can be used to associate any kind of custom meta data with the answer.
+- `meta`: Dict that can be used to associate any kind of custom meta data with the answer.
 In extractive QA, this will carry the meta data of the document where the answer was found.
 
 <a id="schema.Answer.__lt__"></a>

haystack/schema.py

Lines changed: 12 additions & 12 deletions

(The docstring changes below are likewise whitespace-only; the functional change is the `str(...)` cast in the `MultiLabel` constructor in the last hunk.)

@@ -293,10 +293,10 @@ class Span:
     start: int
     end: int
     """
-    Defining a sequence of characters (Text span) or cells (Table span) via start and end index.
-    For extractive QA: Character where answer starts/ends
+    Defining a sequence of characters (Text span) or cells (Table span) via start and end index.
+    For extractive QA: Character where answer starts/ends
     For TableQA: Cell where the answer starts/ends (counted from top left to bottom right of table)
-
+
     :param start: Position where the span starts
     :param end: Position where the spand ends
     """

@@ -318,24 +318,24 @@ class Answer:
     For example, it's used within some Nodes like the Reader, but also in the REST API.
 
     :param answer: The answer string. If there's no possible answer (aka "no_answer" or "is_impossible) this will be an empty string.
-    :param type: One of ("generative", "extractive", "other"): Whether this answer comes from an extractive model
-    (i.e. we can locate an exact answer string in one of the documents) or from a generative model
-    (i.e. no pointer to a specific document, no offsets ...).
+    :param type: One of ("generative", "extractive", "other"): Whether this answer comes from an extractive model
+    (i.e. we can locate an exact answer string in one of the documents) or from a generative model
+    (i.e. no pointer to a specific document, no offsets ...).
     :param score: The relevance score of the Answer determined by a model (e.g. Reader or Generator).
     In the range of [0,1], where 1 means extremely relevant.
     :param context: The related content that was used to create the answer (i.e. a text passage, part of a table, image ...)
     :param offsets_in_document: List of `Span` objects with start and end positions of the answer **in the
     document** (as stored in the document store).
-    For extractive QA: Character where answer starts => `Answer.offsets_in_document[0].start
+    For extractive QA: Character where answer starts => `Answer.offsets_in_document[0].start
     For TableQA: Cell where the answer starts (counted from top left to bottom right of table) => `Answer.offsets_in_document[0].start
-    (Note that in TableQA there can be multiple cell ranges that are relevant for the answer, thus there can be multiple `Spans` here)
+    (Note that in TableQA there can be multiple cell ranges that are relevant for the answer, thus there can be multiple `Spans` here)
     :param offsets_in_context: List of `Span` objects with start and end positions of the answer **in the
     context** (i.e. the surrounding text/table of a certain window size).
-    For extractive QA: Character where answer starts => `Answer.offsets_in_document[0].start
+    For extractive QA: Character where answer starts => `Answer.offsets_in_document[0].start
     For TableQA: Cell where the answer starts (counted from top left to bottom right of table) => `Answer.offsets_in_document[0].start
-    (Note that in TableQA there can be multiple cell ranges that are relevant for the answer, thus there can be multiple `Spans` here)
+    (Note that in TableQA there can be multiple cell ranges that are relevant for the answer, thus there can be multiple `Spans` here)
     :param document_id: ID of the document that the answer was located it (if any)
-    :param meta: Dict that can be used to associate any kind of custom meta data with the answer.
+    :param meta: Dict that can be used to associate any kind of custom meta data with the answer.
     In extractive QA, this will carry the meta data of the document where the answer was found.
     """
 

@@ -679,7 +679,7 @@ def __init__(self, labels: List[Label], drop_negative_labels=False, drop_no_answ
         # Hence, we exclude them here as well.
 
         self.document_ids = [l.document.id for l in self.labels if not l.no_answer]
-        self.contexts = [l.document.content for l in self.labels if not l.no_answer]
+        self.contexts = [str(l.document.content) for l in self.labels if not l.no_answer]
 
     def _aggregate_labels(self, key, must_be_single_value=True) -> List[Any]:
         if any(isinstance(getattr(l, key), dict) for l in self.labels):
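The only functional change in this commit is the `str(...)` cast above: when a table `Document` stores its `content` as a pandas DataFrame, `MultiLabel.contexts` previously ended up holding the DataFrame object itself, whereas `contexts` is expected to be a list of plain strings (as the new test below asserts). A minimal sketch of the behaviour the fix relies on (assumes pandas is installed; variable names are illustrative):

import pandas as pd

# A table Document keeps its content as a DataFrame rather than a string.
table_content = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})

# str() renders the table as plain text; this is what MultiLabel.contexts
# now stores instead of the raw DataFrame object.
context = str(table_content)

print(type(context).__name__)  # str
print(context)
#    col1  col2
# 0     1     3
# 1     2     4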

test/others/test_schema.py

Lines changed: 14 additions & 0 deletions

@@ -1,6 +1,7 @@
 from haystack.schema import Document, Label, Answer, Span, MultiLabel, SpeechDocument, SpeechAnswer
 import pytest
 import numpy as np
+import pandas as pd
 
 from ..conftest import SAMPLES_PATH
 

@@ -401,6 +402,19 @@ def test_multilabel_id():
     assert MultiLabel(labels=[label3]).id == "531445fa3bdf98b8598a3bea032bd605"
 
 
+def test_multilabel_with_doc_containing_dataframes():
+    label = Label(
+        query="A question",
+        document=Document(content=pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})),
+        is_correct_answer=True,
+        is_correct_document=True,
+        origin="gold-label",
+        answer=Answer(answer="answer 1"),
+    )
+    assert len(MultiLabel(labels=[label]).contexts) == 1
+    assert type(MultiLabel(labels=[label]).contexts[0]) is str
+
+
 def test_serialize_speech_document():
     speech_doc = SpeechDocument(
         id=12345,
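To verify the fix locally, the new test can be run on its own, for example with `pytest test/others/test_schema.py -k multilabel_with_doc_containing_dataframes` (this assumes a development install of Haystack with pandas and the test suite's `conftest` available).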
