33# SPDX-License-Identifier: Apache-2.0
44
55import hashlib
6- import io
7- import warnings
86from dataclasses import asdict , dataclass , field , fields
97from typing import Any , Dict , List , Optional
108
119from numpy import ndarray
12- from pandas import DataFrame , read_json
1310
1411from haystack import logging
1512from haystack .dataclasses .byte_stream import ByteStream
@@ -28,12 +25,12 @@ def __call__(cls, *args, **kwargs):
2825 Called before Document.__init__, will remap legacy fields to new ones.
2926
3027 Also handles building a Document from a flattened dictionary.
28+ Dataframe is not supported anymore.
3129 """
32- # Move `content` to new fields depending on the type
30+ ### Conversion from 1.x Document ###
3331 content = kwargs .get ("content" )
34- if isinstance (content , DataFrame ):
35- kwargs ["dataframe" ] = content
36- del kwargs ["content" ]
32+ if content and not isinstance (content , str ):
33+ raise ValueError ("The `content` field must be a string or None." )
3734
3835 # Not used anymore
3936 if "content_type" in kwargs :
@@ -55,12 +52,11 @@ class Document(metaclass=_BackwardCompatible):
5552 """
5653 Base data class containing some data to be queried.
5754
58- Can contain text snippets, tables, and file paths to images or audios. Documents can be sorted by score and saved
55+ Can contain text snippets and file paths to images or audios. Documents can be sorted by score and saved
5956 to/from dictionary and JSON.
6057
6158 :param id: Unique identifier for the document. When not set, it's generated based on the Document fields' values.
6259 :param content: Text of the document, if the document contains text.
63- :param dataframe: Pandas dataframe with the document's content, if the document contains tabular data.
6460 :param blob: Binary data associated with the document, if the document has any binary data associated with it.
6561 :param meta: Additional custom metadata for the document. Must be JSON-serializable.
6662 :param score: Score of the document. Used for ranking, usually assigned by retrievers.
@@ -70,7 +66,6 @@ class Document(metaclass=_BackwardCompatible):
7066
7167 id : str = field (default = "" )
7268 content : Optional [str ] = field (default = None )
73- dataframe : Optional [DataFrame ] = field (default = None )
7469 blob : Optional [ByteStream ] = field (default = None )
7570 meta : Dict [str , Any ] = field (default_factory = dict )
7671 score : Optional [float ] = field (default = None )
@@ -83,8 +78,6 @@ def __repr__(self):
8378 fields .append (
8479 f"content: '{ self .content } '" if len (self .content ) < 100 else f"content: '{ self .content [:100 ]} ...'"
8580 )
86- if self .dataframe is not None :
87- fields .append (f"dataframe: { self .dataframe .shape } " )
8881 if self .blob is not None :
8982 fields .append (f"blob: { len (self .blob .data )} bytes" )
9083 if len (self .meta ) > 0 :
@@ -115,16 +108,12 @@ def __post_init__(self):
115108 # Generate an id only if not explicitly set
116109 self .id = self .id or self ._create_id ()
117110
118- if self .dataframe is not None :
119- msg = "The `dataframe` field is deprecated and will be removed in Haystack 2.11.0."
120- warnings .warn (msg , DeprecationWarning )
121-
122111 def _create_id (self ):
123112 """
124113 Creates a hash of the given content that acts as the document's ID.
125114 """
126115 text = self .content or None
127- dataframe = self . dataframe . to_json () if self . dataframe is not None else None
116+ dataframe = None # this allows the ID creation to remain unchanged even if the dataframe field has been removed
128117 blob = self .blob .data if self .blob is not None else None
129118 mime_type = self .blob .mime_type if self .blob is not None else None
130119 meta = self .meta or {}
@@ -137,14 +126,12 @@ def to_dict(self, flatten=True) -> Dict[str, Any]:
137126 """
138127 Converts Document into a dictionary.
139128
140- `dataframe` and ` blob` fields are converted to JSON-serializable types .
129+ `blob` field is converted to a JSON-serializable type .
141130
142131 :param flatten:
143132 Whether to flatten `meta` field or not. Defaults to `True` to be backward-compatible with Haystack 1.x.
144133 """
145134 data = asdict (self )
146- if (dataframe := data .get ("dataframe" )) is not None :
147- data ["dataframe" ] = dataframe .to_json ()
148135 if (blob := data .get ("blob" )) is not None :
149136 data ["blob" ] = {"data" : list (blob ["data" ]), "mime_type" : blob ["mime_type" ]}
150137
@@ -159,10 +146,8 @@ def from_dict(cls, data: Dict[str, Any]) -> "Document":
159146 """
160147 Creates a new Document object from a dictionary.
161148
162- The `dataframe` and ` blob` fields are converted to their original types .
149+ The `blob` field is converted to its original type .
163150 """
164- if (dataframe := data .get ("dataframe" )) is not None :
165- data ["dataframe" ] = read_json (io .StringIO (dataframe ))
166151 if blob := data .get ("blob" ):
167152 data ["blob" ] = ByteStream (data = bytes (blob ["data" ]), mime_type = blob ["mime_type" ])
168153 if sparse_embedding := data .get ("sparse_embedding" ):
@@ -198,15 +183,7 @@ def content_type(self):
198183 Returns the type of the content for the document.
199184
200185 This is necessary to keep backward compatibility with 1.x.
201-
202- :raises ValueError:
203- If both `text` and `dataframe` fields are set or both are missing.
204186 """
205- if self .content is not None and self .dataframe is not None :
206- raise ValueError ("Both text and dataframe are set." )
207-
208187 if self .content is not None :
209188 return "text"
210- elif self .dataframe is not None :
211- return "table"
212- raise ValueError ("Neither text nor dataframe is set." )
189+ raise ValueError ("Content is not set." )
0 commit comments