filterblock: Don't break when filtering on unexpected data

russellb · shivchander · oindrillac · commit 51545851874e · 2024-07-08T14:04:34.000-04:00
The previous code in this block did its filtering on the assumption that every sample held a value that was valid for the target type. For example, when filtering on an integer value, it assumed every row had a valid integer, when a row may instead contain garbage. This change introduces a new helper, _convert_dtype(), which properly handles this condition. When the conversion raises a `ValueError`, the helper stores `None` for that value instead of allowing the exception to propagate up to the caller. The fix was authored by Shiv in PR instructlab#72. I only pulled it out into a standalone commit. Signed-off-by: Russell Bryant <rbryant@redhat.com> Co-authored-by: shiv <shivchander.s30@gmail.com>
diff --git a/src/instructlab/sdg/filterblock.py b/src/instructlab/sdg/filterblock.py
@@ -20,13 +20,20 @@ def __init__(
         self.convert_dtype = convert_dtype
         self.num_procs = batch_kwargs.get("num_procs", 1)
 
+    def _convert_dtype(self, sample):
+        """Cast ``sample[self.column_name]`` with ``self.convert_dtype``.
+
+        Used as the per-sample callback for ``samples.map`` in ``generate``.
+        A value that cannot be converted is logged and replaced with ``None``
+        instead of raising, so one bad row does not abort the whole map.
+
+        Returns the same ``sample`` mapping, updated in place.
+        """
+        try:
+            sample[self.column_name] = self.convert_dtype(sample[self.column_name])
+        # ValueError covers unparseable text (e.g. int("abc")); TypeError covers
+        # values of an unsupported type (e.g. int(None)).
+        except (ValueError, TypeError) as e:
+            logger.error(
+                "Error converting dtype: %s, filling with None to be filtered later", e
+            )
+            sample[self.column_name] = None
+        return sample
+
     def generate(self, samples) -> Dataset:
         if self.convert_dtype:
             samples = samples.map(
-                lambda x: {
-                    **x,
-                    self.column_name: self.convert_dtype(x[self.column_name]),
-                },
+                self._convert_dtype,
                 num_proc=self.num_procs,
             )