deepset-ai
diff --git a/haystack/components/converters/azure.py b/haystack/components/converters/azure.py
Lines changed: 2 additions & 2 deletions
diff --git a/haystack/components/converters/xlsx.py b/haystack/components/converters/xlsx.py
Lines changed: 3 additions & 4 deletions
diff --git a/haystack/components/joiners/answer_joiner.py b/haystack/components/joiners/answer_joiner.py
Lines changed: 2 additions & 2 deletions
diff --git a/haystack/components/preprocessors/document_cleaner.py b/haystack/components/preprocessors/document_cleaner.py
Lines changed: 0 additions & 1 deletion
diff --git a/haystack/dataclasses/answer.py b/haystack/dataclasses/answer.py
Lines changed: 0 additions & 73 deletions
diff --git a/haystack/dataclasses/document.py b/haystack/dataclasses/document.py
Lines changed: 9 additions & 32 deletions
diff --git a/haystack/document_stores/in_memory/document_store.py b/haystack/document_stores/in_memory/document_store.py
Lines changed: 2 additions & 22 deletions
diff --git a/haystack/evaluation/eval_run_result.py b/haystack/evaluation/eval_run_result.py
Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@
     from azure.core.credentials import AzureKeyCredential
 
 with LazyImport(message="Run 'pip install pandas'") as pandas_import:
-    import pandas as pd
+    from pandas import DataFrame
 
 
 @component
@@ -306,7 +306,7 @@ def _convert_tables(self, result: "AnalyzeResult", meta: Optional[Dict[str, Any]
                 table_meta["page"] = table.bounding_regions[0].page_number
 
             # Convert table to CSV
-            table_df = pd.DataFrame(data=table_list)
+            table_df = DataFrame(data=table_list)
             table_content = table_df.to_csv(header=False, index=False, lineterminator="\n")
             converted_tables.append(Document(content=table_content, meta=table_meta))
 
 
@@ -7,17 +7,16 @@
 from pathlib import Path
 from typing import Any, Dict, List, Literal, Optional, Tuple, Union
 
-import pandas as pd
-
 from haystack import Document, component, logging
 from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
 from haystack.dataclasses import ByteStream
 from haystack.lazy_imports import LazyImport
 
 logger = logging.getLogger(__name__)
 
-with LazyImport("Run 'pip install openpyxl'") as xlsx_import:
+with LazyImport("Run 'pip install pandas openpyxl'") as pandas_xlsx_import:
     import openpyxl  # pylint: disable=unused-import # the library is used but not directly referenced
+    import pandas as pd
 
 with LazyImport("Run 'pip install tabulate'") as tabulate_import:
     from tabulate import tabulate  # pylint: disable=unused-import # the library is used but not directly referenced
@@ -69,7 +68,7 @@ def __init__(
             If True, the full path of the file is stored in the metadata of the document.
             If False, only the file name is stored.
         """
-        xlsx_import.check()
+        pandas_xlsx_import.check()
         self.table_format = table_format
         if table_format not in ["csv", "markdown"]:
             raise ValueError(f"Unsupported export format: {table_format}. Choose either 'csv' or 'markdown'.")
 
@@ -9,9 +9,9 @@
 
 from haystack import component, default_from_dict, default_to_dict, logging
 from haystack.core.component.types import Variadic
-from haystack.dataclasses.answer import ExtractedAnswer, ExtractedTableAnswer, GeneratedAnswer
+from haystack.dataclasses.answer import ExtractedAnswer, GeneratedAnswer
 
-AnswerType = Union[GeneratedAnswer, ExtractedTableAnswer, ExtractedAnswer]
+AnswerType = Union[GeneratedAnswer, ExtractedAnswer]
 
 logger = logging.getLogger(__name__)
 
 
@@ -134,7 +134,6 @@ def run(self, documents: List[Document]):
             clean_doc = Document(
                 id=doc.id if self.keep_id else "",
                 content=text,
-                dataframe=doc.dataframe,
                 blob=doc.blob,
                 meta=deepcopy(doc.meta),
                 score=doc.score,
 
@@ -2,13 +2,9 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-import io
-import warnings
 from dataclasses import asdict, dataclass, field
 from typing import Any, Dict, List, Optional, Protocol, runtime_checkable
 
-from pandas import DataFrame, read_json
-
 from haystack.core.serialization import default_from_dict, default_to_dict
 from haystack.dataclasses.document import Document
 
@@ -88,75 +84,6 @@ def from_dict(cls, data: Dict[str, Any]) -> "ExtractedAnswer":
         return default_from_dict(cls, data)
 
 
-@dataclass
-class ExtractedTableAnswer:
-    query: str
-    score: float
-    data: Optional[str] = None
-    document: Optional[Document] = None
-    context: Optional[DataFrame] = None
-    document_cells: List["Cell"] = field(default_factory=list)
-    context_cells: List["Cell"] = field(default_factory=list)
-    meta: Dict[str, Any] = field(default_factory=dict)
-
-    def __post_init__(self):
-        msg = "The `ExtractedTableAnswer` dataclass is deprecated and will be removed in Haystack 2.11.0."
-        warnings.warn(msg, DeprecationWarning)
-
-    @dataclass
-    class Cell:
-        row: int
-        column: int
-
-    def to_dict(self) -> Dict[str, Any]:
-        """
-        Serialize the object to a dictionary.
-
-        :returns:
-            Serialized dictionary representation of the object.
-        """
-        document = self.document.to_dict(flatten=False) if self.document is not None else None
-        context = self.context.to_json() if self.context is not None else None
-        document_cells = [asdict(c) for c in self.document_cells]
-        context_cells = [asdict(c) for c in self.context_cells]
-        return default_to_dict(
-            self,
-            data=self.data,
-            query=self.query,
-            document=document,
-            context=context,
-            score=self.score,
-            document_cells=document_cells,
-            context_cells=context_cells,
-            meta=self.meta,
-        )
-
-    @classmethod
-    def from_dict(cls, data: Dict[str, Any]) -> "ExtractedTableAnswer":
-        """
-        Deserialize the object from a dictionary.
-
-        :param data:
-            Dictionary representation of the object.
-
-        :returns:
-            Deserialized object.
-        """
-        init_params = data.get("init_parameters", {})
-        if (doc := init_params.get("document")) is not None:
-            data["init_parameters"]["document"] = Document.from_dict(doc)
-
-        if (context := init_params.get("context")) is not None:
-            data["init_parameters"]["context"] = read_json(io.StringIO(context))
-
-        if (cells := init_params.get("document_cells")) is not None:
-            data["init_parameters"]["document_cells"] = [ExtractedTableAnswer.Cell(**c) for c in cells]
-
-        if (cells := init_params.get("context_cells")) is not None:
-            data["init_parameters"]["context_cells"] = [ExtractedTableAnswer.Cell(**c) for c in cells]
-        return default_from_dict(cls, data)
-
-
 @dataclass
 class GeneratedAnswer:
     data: str
 
@@ -3,13 +3,10 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import hashlib
-import io
-import warnings
 from dataclasses import asdict, dataclass, field, fields
 from typing import Any, Dict, List, Optional
 
 from numpy import ndarray
-from pandas import DataFrame, read_json
 
 from haystack import logging
 from haystack.dataclasses.byte_stream import ByteStream
@@ -28,12 +25,12 @@ def __call__(cls, *args, **kwargs):
         Called before Document.__init__, will remap legacy fields to new ones.
 
         Also handles building a Document from a flattened dictionary.
+        Passing a pandas DataFrame as `content` is no longer supported.
         """
-        # Move `content` to new fields depending on the type
+        ### Conversion from 1.x Document ###
         content = kwargs.get("content")
-        if isinstance(content, DataFrame):
-            kwargs["dataframe"] = content
-            del kwargs["content"]
+        if content and not isinstance(content, str):
+            raise ValueError("The `content` field must be a string or None.")
 
         # Not used anymore
         if "content_type" in kwargs:
@@ -55,12 +52,11 @@ class Document(metaclass=_BackwardCompatible):
     """
     Base data class containing some data to be queried.
 
-    Can contain text snippets, tables, and file paths to images or audios. Documents can be sorted by score and saved
+    Can contain text snippets and file paths to images or audios. Documents can be sorted by score and saved
     to/from dictionary and JSON.
 
     :param id: Unique identifier for the document. When not set, it's generated based on the Document fields' values.
     :param content: Text of the document, if the document contains text.
-    :param dataframe: Pandas dataframe with the document's content, if the document contains tabular data.
     :param blob: Binary data associated with the document, if the document has any binary data associated with it.
     :param meta: Additional custom metadata for the document. Must be JSON-serializable.
     :param score: Score of the document. Used for ranking, usually assigned by retrievers.
@@ -70,7 +66,6 @@ class Document(metaclass=_BackwardCompatible):
 
     id: str = field(default="")
     content: Optional[str] = field(default=None)
-    dataframe: Optional[DataFrame] = field(default=None)
     blob: Optional[ByteStream] = field(default=None)
     meta: Dict[str, Any] = field(default_factory=dict)
     score: Optional[float] = field(default=None)
@@ -83,8 +78,6 @@ def __repr__(self):
             fields.append(
                 f"content: '{self.content}'" if len(self.content) < 100 else f"content: '{self.content[:100]}...'"
             )
-        if self.dataframe is not None:
-            fields.append(f"dataframe: {self.dataframe.shape}")
         if self.blob is not None:
             fields.append(f"blob: {len(self.blob.data)} bytes")
         if len(self.meta) > 0:
@@ -115,16 +108,12 @@ def __post_init__(self):
         # Generate an id only if not explicitly set
         self.id = self.id or self._create_id()
 
-        if self.dataframe is not None:
-            msg = "The `dataframe` field is deprecated and will be removed in Haystack 2.11.0."
-            warnings.warn(msg, DeprecationWarning)
-
     def _create_id(self):
         """
         Creates a hash of the given content that acts as the document's ID.
         """
         text = self.content or None
-        dataframe = self.dataframe.to_json() if self.dataframe is not None else None
+        dataframe = None  # this allows the ID creation to remain unchanged even if the dataframe field has been removed
         blob = self.blob.data if self.blob is not None else None
         mime_type = self.blob.mime_type if self.blob is not None else None
         meta = self.meta or {}
@@ -137,14 +126,12 @@ def to_dict(self, flatten=True) -> Dict[str, Any]:
         """
         Converts Document into a dictionary.
 
-        `dataframe` and `blob` fields are converted to JSON-serializable types.
+        `blob` field is converted to a JSON-serializable type.
 
         :param flatten:
             Whether to flatten `meta` field or not. Defaults to `True` to be backward-compatible with Haystack 1.x.
         """
         data = asdict(self)
-        if (dataframe := data.get("dataframe")) is not None:
-            data["dataframe"] = dataframe.to_json()
         if (blob := data.get("blob")) is not None:
             data["blob"] = {"data": list(blob["data"]), "mime_type": blob["mime_type"]}
 
@@ -159,10 +146,8 @@ def from_dict(cls, data: Dict[str, Any]) -> "Document":
         """
         Creates a new Document object from a dictionary.
 
-        The `dataframe` and `blob` fields are converted to their original types.
+        The `blob` field is converted to its original type.
         """
-        if (dataframe := data.get("dataframe")) is not None:
-            data["dataframe"] = read_json(io.StringIO(dataframe))
         if blob := data.get("blob"):
             data["blob"] = ByteStream(data=bytes(blob["data"]), mime_type=blob["mime_type"])
         if sparse_embedding := data.get("sparse_embedding"):
@@ -198,15 +183,7 @@ def content_type(self):
         Returns the type of the content for the document.
 
         This is necessary to keep backward compatibility with 1.x.
-
-        :raises ValueError:
-            If both `text` and `dataframe` fields are set or both are missing.
         """
-        if self.content is not None and self.dataframe is not None:
-            raise ValueError("Both text and dataframe are set.")
-
         if self.content is not None:
             return "text"
-        elif self.dataframe is not None:
-            return "table"
-        raise ValueError("Neither text nor dataframe is set.")
+        raise ValueError("Content is not set.")
@@ -433,23 +433,9 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D
             if document.id in self.storage.keys():
                 self.delete_documents([document.id])
 
-            # This processing logic is extracted from the original bm25_retrieval method.
-            # Since we are creating index incrementally before the first retrieval,
-            # we need to determine what content to use for indexing here, not at query time.
+            tokens = []
             if document.content is not None:
-                if document.dataframe is not None:
-                    logger.warning(
-                        "Document '{document_id}' has both text and dataframe content. "
-                        "Using text content for retrieval and skipping dataframe content.",
-                        document_id=document.id,
-                    )
                 tokens = self._tokenize_bm25(document.content)
-            elif document.dataframe is not None:
-                str_content = document.dataframe.astype(str)
-                csv_content = str_content.to_csv(index=False)
-                tokens = self._tokenize_bm25(csv_content)
-            else:
-                tokens = []
 
             self.storage[document.id] = document
 
@@ -495,13 +481,7 @@ def bm25_retrieval(
         if not query:
             raise ValueError("Query should be a non-empty string")
 
-        content_type_filter = {
-            "operator": "OR",
-            "conditions": [
-                {"field": "content", "operator": "!=", "value": None},
-                {"field": "dataframe", "operator": "!=", "value": None},
-            ],
-        }
+        content_type_filter = {"field": "content", "operator": "!=", "value": None}
         if filters:
             if "operator" not in filters:
                 raise ValueError(
 
@@ -97,7 +97,7 @@ def _write_to_csv(csv_file: str, data: Dict[str, List[Any]]) -> str:
     @staticmethod
     def _handle_output(
         data: Dict[str, List[Any]], output_format: Literal["json", "csv", "df"] = "csv", csv_file: Optional[str] = None
-    ) -> Union[str, DataFrame, Dict[str, List[Any]]]:
+    ) -> Union[str, "DataFrame", Dict[str, List[Any]]]:
         """
         Handles output formatting based on `output_format`.