Skip to content

Commit f3c44be

Browse files
refactor!: remove dataframe field from Document and ExtractedTableAnswer; make pandas optional (#8906)
* remove dataframe
* release note
* small fix
* group imports
* Update pyproject.toml (Co-authored-by: Julian Risch <[email protected]>)
* Update pyproject.toml (Co-authored-by: Julian Risch <[email protected]>)
* address feedback

---------

Co-authored-by: Julian Risch <[email protected]>
1 parent 0d65b4c commit f3c44be

File tree

28 files changed: 70 additions and 478 deletions.

28 files changed: 70 additions and 478 deletions.

haystack/components/converters/azure.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,7 @@
2323
from azure.core.credentials import AzureKeyCredential
2424

2525
with LazyImport(message="Run 'pip install pandas'") as pandas_import:
26-
import pandas as pd
26+
from pandas import DataFrame
2727

2828

2929
@component
@@ -306,7 +306,7 @@ def _convert_tables(self, result: "AnalyzeResult", meta: Optional[Dict[str, Any]
306306
table_meta["page"] = table.bounding_regions[0].page_number
307307

308308
# Convert table to CSV
309-
table_df = pd.DataFrame(data=table_list)
309+
table_df = DataFrame(data=table_list)
310310
table_content = table_df.to_csv(header=False, index=False, lineterminator="\n")
311311
converted_tables.append(Document(content=table_content, meta=table_meta))
312312

haystack/components/converters/xlsx.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,17 +7,16 @@
77
from pathlib import Path
88
from typing import Any, Dict, List, Literal, Optional, Tuple, Union
99

10-
import pandas as pd
11-
1210
from haystack import Document, component, logging
1311
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
1412
from haystack.dataclasses import ByteStream
1513
from haystack.lazy_imports import LazyImport
1614

1715
logger = logging.getLogger(__name__)
1816

19-
with LazyImport("Run 'pip install openpyxl'") as xlsx_import:
17+
with LazyImport("Run 'pip install pandas openpyxl'") as pandas_xlsx_import:
2018
import openpyxl # pylint: disable=unused-import # the library is used but not directly referenced
19+
import pandas as pd
2120

2221
with LazyImport("Run 'pip install tabulate'") as tabulate_import:
2322
from tabulate import tabulate # pylint: disable=unused-import # the library is used but not directly referenced
@@ -69,7 +68,7 @@ def __init__(
6968
If True, the full path of the file is stored in the metadata of the document.
7069
If False, only the file name is stored.
7170
"""
72-
xlsx_import.check()
71+
pandas_xlsx_import.check()
7372
self.table_format = table_format
7473
if table_format not in ["csv", "markdown"]:
7574
raise ValueError(f"Unsupported export format: {table_format}. Choose either 'csv' or 'markdown'.")

haystack/components/joiners/answer_joiner.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,9 @@
99

1010
from haystack import component, default_from_dict, default_to_dict, logging
1111
from haystack.core.component.types import Variadic
12-
from haystack.dataclasses.answer import ExtractedAnswer, ExtractedTableAnswer, GeneratedAnswer
12+
from haystack.dataclasses.answer import ExtractedAnswer, GeneratedAnswer
1313

14-
AnswerType = Union[GeneratedAnswer, ExtractedTableAnswer, ExtractedAnswer]
14+
AnswerType = Union[GeneratedAnswer, ExtractedAnswer]
1515

1616
logger = logging.getLogger(__name__)
1717

haystack/components/preprocessors/document_cleaner.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,6 @@ def run(self, documents: List[Document]):
134134
clean_doc = Document(
135135
id=doc.id if self.keep_id else "",
136136
content=text,
137-
dataframe=doc.dataframe,
138137
blob=doc.blob,
139138
meta=deepcopy(doc.meta),
140139
score=doc.score,

haystack/dataclasses/answer.py

Lines changed: 0 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,9 @@
22
#
33
# SPDX-License-Identifier: Apache-2.0
44

5-
import io
6-
import warnings
75
from dataclasses import asdict, dataclass, field
86
from typing import Any, Dict, List, Optional, Protocol, runtime_checkable
97

10-
from pandas import DataFrame, read_json
11-
128
from haystack.core.serialization import default_from_dict, default_to_dict
139
from haystack.dataclasses.document import Document
1410

@@ -88,75 +84,6 @@ def from_dict(cls, data: Dict[str, Any]) -> "ExtractedAnswer":
8884
return default_from_dict(cls, data)
8985

9086

91-
@dataclass
92-
class ExtractedTableAnswer:
93-
query: str
94-
score: float
95-
data: Optional[str] = None
96-
document: Optional[Document] = None
97-
context: Optional[DataFrame] = None
98-
document_cells: List["Cell"] = field(default_factory=list)
99-
context_cells: List["Cell"] = field(default_factory=list)
100-
meta: Dict[str, Any] = field(default_factory=dict)
101-
102-
def __post_init__(self):
103-
msg = "The `ExtractedTableAnswer` dataclass is deprecated and will be removed in Haystack 2.11.0."
104-
warnings.warn(msg, DeprecationWarning)
105-
106-
@dataclass
107-
class Cell:
108-
row: int
109-
column: int
110-
111-
def to_dict(self) -> Dict[str, Any]:
112-
"""
113-
Serialize the object to a dictionary.
114-
115-
:returns:
116-
Serialized dictionary representation of the object.
117-
"""
118-
document = self.document.to_dict(flatten=False) if self.document is not None else None
119-
context = self.context.to_json() if self.context is not None else None
120-
document_cells = [asdict(c) for c in self.document_cells]
121-
context_cells = [asdict(c) for c in self.context_cells]
122-
return default_to_dict(
123-
self,
124-
data=self.data,
125-
query=self.query,
126-
document=document,
127-
context=context,
128-
score=self.score,
129-
document_cells=document_cells,
130-
context_cells=context_cells,
131-
meta=self.meta,
132-
)
133-
134-
@classmethod
135-
def from_dict(cls, data: Dict[str, Any]) -> "ExtractedTableAnswer":
136-
"""
137-
Deserialize the object from a dictionary.
138-
139-
:param data:
140-
Dictionary representation of the object.
141-
142-
:returns:
143-
Deserialized object.
144-
"""
145-
init_params = data.get("init_parameters", {})
146-
if (doc := init_params.get("document")) is not None:
147-
data["init_parameters"]["document"] = Document.from_dict(doc)
148-
149-
if (context := init_params.get("context")) is not None:
150-
data["init_parameters"]["context"] = read_json(io.StringIO(context))
151-
152-
if (cells := init_params.get("document_cells")) is not None:
153-
data["init_parameters"]["document_cells"] = [ExtractedTableAnswer.Cell(**c) for c in cells]
154-
155-
if (cells := init_params.get("context_cells")) is not None:
156-
data["init_parameters"]["context_cells"] = [ExtractedTableAnswer.Cell(**c) for c in cells]
157-
return default_from_dict(cls, data)
158-
159-
16087
@dataclass
16188
class GeneratedAnswer:
16289
data: str

haystack/dataclasses/document.py

Lines changed: 9 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,10 @@
33
# SPDX-License-Identifier: Apache-2.0
44

55
import hashlib
6-
import io
7-
import warnings
86
from dataclasses import asdict, dataclass, field, fields
97
from typing import Any, Dict, List, Optional
108

119
from numpy import ndarray
12-
from pandas import DataFrame, read_json
1310

1411
from haystack import logging
1512
from haystack.dataclasses.byte_stream import ByteStream
@@ -28,12 +25,12 @@ def __call__(cls, *args, **kwargs):
2825
Called before Document.__init__, will remap legacy fields to new ones.
2926
3027
Also handles building a Document from a flattened dictionary.
28+
Dataframe is not supported anymore.
3129
"""
32-
# Move `content` to new fields depending on the type
30+
### Conversion from 1.x Document ###
3331
content = kwargs.get("content")
34-
if isinstance(content, DataFrame):
35-
kwargs["dataframe"] = content
36-
del kwargs["content"]
32+
if content and not isinstance(content, str):
33+
raise ValueError("The `content` field must be a string or None.")
3734

3835
# Not used anymore
3936
if "content_type" in kwargs:
@@ -55,12 +52,11 @@ class Document(metaclass=_BackwardCompatible):
5552
"""
5653
Base data class containing some data to be queried.
5754
58-
Can contain text snippets, tables, and file paths to images or audios. Documents can be sorted by score and saved
55+
Can contain text snippets and file paths to images or audios. Documents can be sorted by score and saved
5956
to/from dictionary and JSON.
6057
6158
:param id: Unique identifier for the document. When not set, it's generated based on the Document fields' values.
6259
:param content: Text of the document, if the document contains text.
63-
:param dataframe: Pandas dataframe with the document's content, if the document contains tabular data.
6460
:param blob: Binary data associated with the document, if the document has any binary data associated with it.
6561
:param meta: Additional custom metadata for the document. Must be JSON-serializable.
6662
:param score: Score of the document. Used for ranking, usually assigned by retrievers.
@@ -70,7 +66,6 @@ class Document(metaclass=_BackwardCompatible):
7066

7167
id: str = field(default="")
7268
content: Optional[str] = field(default=None)
73-
dataframe: Optional[DataFrame] = field(default=None)
7469
blob: Optional[ByteStream] = field(default=None)
7570
meta: Dict[str, Any] = field(default_factory=dict)
7671
score: Optional[float] = field(default=None)
@@ -83,8 +78,6 @@ def __repr__(self):
8378
fields.append(
8479
f"content: '{self.content}'" if len(self.content) < 100 else f"content: '{self.content[:100]}...'"
8580
)
86-
if self.dataframe is not None:
87-
fields.append(f"dataframe: {self.dataframe.shape}")
8881
if self.blob is not None:
8982
fields.append(f"blob: {len(self.blob.data)} bytes")
9083
if len(self.meta) > 0:
@@ -115,16 +108,12 @@ def __post_init__(self):
115108
# Generate an id only if not explicitly set
116109
self.id = self.id or self._create_id()
117110

118-
if self.dataframe is not None:
119-
msg = "The `dataframe` field is deprecated and will be removed in Haystack 2.11.0."
120-
warnings.warn(msg, DeprecationWarning)
121-
122111
def _create_id(self):
123112
"""
124113
Creates a hash of the given content that acts as the document's ID.
125114
"""
126115
text = self.content or None
127-
dataframe = self.dataframe.to_json() if self.dataframe is not None else None
116+
dataframe = None # this allows the ID creation to remain unchanged even if the dataframe field has been removed
128117
blob = self.blob.data if self.blob is not None else None
129118
mime_type = self.blob.mime_type if self.blob is not None else None
130119
meta = self.meta or {}
@@ -137,14 +126,12 @@ def to_dict(self, flatten=True) -> Dict[str, Any]:
137126
"""
138127
Converts Document into a dictionary.
139128
140-
`dataframe` and `blob` fields are converted to JSON-serializable types.
129+
`blob` field is converted to a JSON-serializable type.
141130
142131
:param flatten:
143132
Whether to flatten `meta` field or not. Defaults to `True` to be backward-compatible with Haystack 1.x.
144133
"""
145134
data = asdict(self)
146-
if (dataframe := data.get("dataframe")) is not None:
147-
data["dataframe"] = dataframe.to_json()
148135
if (blob := data.get("blob")) is not None:
149136
data["blob"] = {"data": list(blob["data"]), "mime_type": blob["mime_type"]}
150137

@@ -159,10 +146,8 @@ def from_dict(cls, data: Dict[str, Any]) -> "Document":
159146
"""
160147
Creates a new Document object from a dictionary.
161148
162-
The `dataframe` and `blob` fields are converted to their original types.
149+
The `blob` field is converted to its original type.
163150
"""
164-
if (dataframe := data.get("dataframe")) is not None:
165-
data["dataframe"] = read_json(io.StringIO(dataframe))
166151
if blob := data.get("blob"):
167152
data["blob"] = ByteStream(data=bytes(blob["data"]), mime_type=blob["mime_type"])
168153
if sparse_embedding := data.get("sparse_embedding"):
@@ -198,15 +183,7 @@ def content_type(self):
198183
Returns the type of the content for the document.
199184
200185
This is necessary to keep backward compatibility with 1.x.
201-
202-
:raises ValueError:
203-
If both `text` and `dataframe` fields are set or both are missing.
204186
"""
205-
if self.content is not None and self.dataframe is not None:
206-
raise ValueError("Both text and dataframe are set.")
207-
208187
if self.content is not None:
209188
return "text"
210-
elif self.dataframe is not None:
211-
return "table"
212-
raise ValueError("Neither text nor dataframe is set.")
189+
raise ValueError("Content is not set.")

haystack/document_stores/in_memory/document_store.py

Lines changed: 2 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -433,23 +433,9 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D
433433
if document.id in self.storage.keys():
434434
self.delete_documents([document.id])
435435

436-
# This processing logic is extracted from the original bm25_retrieval method.
437-
# Since we are creating index incrementally before the first retrieval,
438-
# we need to determine what content to use for indexing here, not at query time.
436+
tokens = []
439437
if document.content is not None:
440-
if document.dataframe is not None:
441-
logger.warning(
442-
"Document '{document_id}' has both text and dataframe content. "
443-
"Using text content for retrieval and skipping dataframe content.",
444-
document_id=document.id,
445-
)
446438
tokens = self._tokenize_bm25(document.content)
447-
elif document.dataframe is not None:
448-
str_content = document.dataframe.astype(str)
449-
csv_content = str_content.to_csv(index=False)
450-
tokens = self._tokenize_bm25(csv_content)
451-
else:
452-
tokens = []
453439

454440
self.storage[document.id] = document
455441

@@ -495,13 +481,7 @@ def bm25_retrieval(
495481
if not query:
496482
raise ValueError("Query should be a non-empty string")
497483

498-
content_type_filter = {
499-
"operator": "OR",
500-
"conditions": [
501-
{"field": "content", "operator": "!=", "value": None},
502-
{"field": "dataframe", "operator": "!=", "value": None},
503-
],
504-
}
484+
content_type_filter = {"field": "content", "operator": "!=", "value": None}
505485
if filters:
506486
if "operator" not in filters:
507487
raise ValueError(

haystack/evaluation/eval_run_result.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ def _write_to_csv(csv_file: str, data: Dict[str, List[Any]]) -> str:
9797
@staticmethod
9898
def _handle_output(
9999
data: Dict[str, List[Any]], output_format: Literal["json", "csv", "df"] = "csv", csv_file: Optional[str] = None
100-
) -> Union[str, DataFrame, Dict[str, List[Any]]]:
100+
) -> Union[str, "DataFrame", Dict[str, List[Any]]]:
101101
"""
102102
Handles output formatting based on `output_format`.
103103

0 commit comments

Comments
 (0)