Skip to content

Commit 25817b5

Browse files
committed
Added informative errors when passing non-text data without a custom encoder
1 parent 88333b9 commit 25817b5

File tree

2 files changed

+19
-1
lines changed

2 files changed

+19
-1
lines changed

semhash/utils.py

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -134,7 +134,15 @@ def featurize(
134 134               col_texts = [r[col] for r in records]
135 135           except KeyError as e:
136 136               raise ValueError(f"Missing column '{col}' in one or more records") from e
137     -         col_emb = model.encode(col_texts)
    137 +         try:
    138 +             col_emb = model.encode(col_texts)
    139 +         except TypeError as e:
    140 +             sample_type = type(col_texts[0]).__name__ if col_texts else "unknown"
    141 +             raise TypeError(
    142 +                 f"Failed to encode column '{col}' (data type: {sample_type}). "
    143 +                 f"If encoding non-text data, provide a compatible encoder via the `model` parameter. "
    144 +                 f"See our documentation for more info."
    145 +             ) from e
138 146           embeddings_per_col.append(np.asarray(col_emb))
139 147
140 148       return np.concatenate(embeddings_per_col, axis=1)

tests/test_utils.py

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -94,6 +94,16 @@ def test_featurize(model: Encoder) -> None:
 94  94       with pytest.raises(ValueError, match="Missing column 'missing'"):
 95  95           featurize(records, ["missing"], model)
 96  96
     97 +     # Non-text data with text encoder raises helpful TypeError
     98 +     class FakeImage:
     99 +         pass
    100 +
    101 +     records_with_images = [{"img": FakeImage()}, {"img": FakeImage()}]
    102 +     with pytest.raises(TypeError, match="Failed to encode column 'img'"):
    103 +         featurize(records_with_images, ["img"], model)
    104 +     with pytest.raises(TypeError, match="data type: FakeImage"):
    105 +         featurize(records_with_images, ["img"], model)
    106 +
 97 107
 98 108   def test_remove_exact_duplicates() -> None:
 99 109       """Test exact duplicate removal, with and without reference records."""

0 commit comments

Comments
 (0)