2 files changed: +19 -1 lines changed
Columns: original file line number | diff line number | diff line change
@@ -134,7 +134,15 @@ def featurize(
134134 col_texts = [r [col ] for r in records ]
135135 except KeyError as e :
136136 raise ValueError (f"Missing column '{ col } ' in one or more records" ) from e
137- col_emb = model .encode (col_texts )
137+ try :
138+ col_emb = model .encode (col_texts )
139+ except TypeError as e :
140+ sample_type = type (col_texts [0 ]).__name__ if col_texts else "unknown"
141+ raise TypeError (
142+ f"Failed to encode column '{ col } ' (data type: { sample_type } ). "
143+ f"If encoding non-text data, provide a compatible encoder via the `model` parameter. "
144+ f"See our documentation for more info."
145+ ) from e
138146 embeddings_per_col .append (np .asarray (col_emb ))
139147
140148 return np .concatenate (embeddings_per_col , axis = 1 )
Columns: original file line number | diff line number | diff line change
@@ -94,6 +94,16 @@ def test_featurize(model: Encoder) -> None:
9494 with pytest .raises (ValueError , match = "Missing column 'missing'" ):
9595 featurize (records , ["missing" ], model )
9696
97+ # Non-text data with text encoder raises helpful TypeError
98+ class FakeImage :
99+ pass
100+
101+ records_with_images = [{"img" : FakeImage ()}, {"img" : FakeImage ()}]
102+ with pytest .raises (TypeError , match = "Failed to encode column 'img'" ):
103+ featurize (records_with_images , ["img" ], model )
104+ with pytest .raises (TypeError , match = "data type: FakeImage" ):
105+ featurize (records_with_images , ["img" ], model )
106+
97107
98108def test_remove_exact_duplicates () -> None :
99109 """Test exact duplicate removal, with and without reference records."""
You can’t perform that action at this time.
0 commit comments