Refactor WebDataset reader and reduce code density

VibhuJawa · VibhuJawa · commit c9cb74337df6 · 2026-02-17T18:50:55.000Z
diff --git a/WEBDATASET_REDUCTION_PLAN.md b/WEBDATASET_REDUCTION_PLAN.md
@@ -0,0 +1,68 @@
+# WebDataset Reader Reduction Plan
+
+## Scope
+Target file: `nemo_curator/stages/multimodal/io/readers/webdataset.py`
+
+Goal: reduce code density and repeated logic while preserving behavior and test outcomes.
+
+## Principles
+1. No behavior changes unless explicitly called out and reviewed.
+2. Keep exception semantics stable (`raise`/`skip`/`log`).
+3. Run tests after each phase, not only at the end.
+4. Prefer removing duplicated branches over adding many small helpers.
+
+## Current Pain Points
+1. Interleaved row creation has repeated text/image branch logic.
+2. Text-member parsing combines multiple concerns in one path.
+3. Fallback/error handling around interleaved parsing is harder to scan than needed.
+4. Member-type dispatch has repeated suffix handling patterns.
+
+## Reduction Phases
+
+### Phase 1: Control-flow cleanup (low risk)
+1. Keep current behavior, but collapse redundant exception branches where invariant checks already exist.
+2. Keep one clear fallback path for `sample_format != "interleaved"`.
+3. Ensure all error messages that tests rely on remain unchanged.
+
+Acceptance:
+1. Existing multimodal tests pass unchanged.
+2. No schema/output changes.
+
+### Phase 2: Duplicate row-construction removal (low-medium risk)
+1. Centralize interleaved segment row construction into one reusable function.
+2. Reuse that function from all interleaved segment paths.
+3. Keep modality checks and metadata JSON population exactly as today.
+
+Acceptance:
+1. Interleaved reader/writer roundtrip tests pass.
+2. Element metadata JSON behavior remains identical.
+
+### Phase 3: Text-member path decomposition (medium risk)
+1. Split JSON text-member path from plain text-member path for readability.
+2. Keep shared row finalization in one place to avoid re-duplicating code.
+3. Retain existing sample-id/position assignment rules.
+
+Acceptance:
+1. Non-interleaved JSON fallback behavior remains unchanged.
+2. Metadata sidecar population stays first-wins, exactly as the current tests assert.
+
+### Phase 4: Line-count pass (optional)
+1. Re-evaluate helper count vs readability.
+2. Inline helpers that only wrap one call and do not improve clarity.
+3. Keep final structure flat and easy to follow.
+
+Acceptance:
+1. Net reduction in repeated branches and lines where practical.
+2. No decrease in maintainability/readability.
+
+## Test Gate Per Phase
+Run:
+1. `pytest -q tests/stages/multimodal/test_writer_output_formats.py`
+2. `pytest -q tests/stages/multimodal/test_parquet_reader.py`
+
+## Deliverables
+1. One commit per phase (or grouped Phase 1+2 if very small).
+2. Updated diff summary with:
+   - duplicated branches removed
+   - net insertions/deletions
+   - test results
diff --git a/nemo_curator/stages/multimodal/io/readers/webdataset.py b/nemo_curator/stages/multimodal/io/readers/webdataset.py
@@ -160,15 +160,15 @@ def default_interleaved_field_map() -> dict[str, str]:
     def __post_init__(self) -> None:
         """Validate reader configuration."""
         super().__post_init__()
-        if self.sample_format not in _SUPPORTED_SAMPLE_FORMATS:
-            msg = f"Unsupported sample_format='{self.sample_format}'. Expected one of: auto, simple, interleaved"
-            raise ValueError(msg)
-        if self.modalities_to_load not in _SUPPORTED_MODALITIES_TO_LOAD:
-            msg = f"Unsupported modalities_to_load='{self.modalities_to_load}'. Expected one of: all, image, text"
-            raise ValueError(msg)
-        if self.error_handling not in _SUPPORTED_ERROR_HANDLING:
-            msg = f"Unsupported error_handling='{self.error_handling}'. Expected one of: raise, skip, log"
-            raise ValueError(msg)
+        for field_name, value, supported in (
+            ("sample_format", self.sample_format, _SUPPORTED_SAMPLE_FORMATS),
+            ("modalities_to_load", self.modalities_to_load, _SUPPORTED_MODALITIES_TO_LOAD),
+            ("error_handling", self.error_handling, _SUPPORTED_ERROR_HANDLING),
+        ):
+            if value not in supported:
+                options = ", ".join(sorted(supported))
+                msg = f"Unsupported {field_name}='{value}'. Expected one of: {options}"
+                raise ValueError(msg)
         default_map = self.default_interleaved_field_map()
         unknown = sorted(set(self.interleaved_field_map or {}) - set(default_map))
         if unknown:
@@ -244,25 +244,22 @@ def _rows_from_text_member(
         source: RowSource,
     ) -> list[dict[str, object]]:
         if suffix == ".json":
-            parsed = self._maybe_rows_from_interleaved_json_member(payload, source, state, member_name)
-            if parsed is not None:
-                return parsed
+            if payload is None:
+                msg = f"JSON member '{member_name}' missing payload bytes"
+                raise WebDatasetMemberParseError(msg)
+            try:
+                return self._rows_from_interleaved_json(payload, source, state)
+            except WebDatasetMemberParseError:
+                if self.sample_format == "interleaved":
+                    raise
         if not self._loads_modality("text"):
             return []
         sid, position = self._next_sample_and_position(state.sample_counters, member_name, "text")
         text_content = self._decode_text_payload(payload, member_name)
         content_type = "application/json" if suffix == ".json" else "text/plain"
         if suffix == ".json":
             _record_metadata_row(state, sid, text_content or "{}")
-        return [
-            self._text_row(
-                sid=sid,
-                position=position,
-                source_shard=source.source_shard,
-                content_type=content_type,
-                text_content=text_content,
-            )
-        ]
+        return [self._text_row(sid=sid, position=position, source_shard=source.source_shard, content_type=content_type, text_content=text_content)]
 
     def _rows_from_interleaved_json(
         self,
@@ -275,11 +272,11 @@ def _rows_from_interleaved_json(
         sample_payload = dict(decoded)
         sample_payload.pop(self.interleaved_field_map["segments"], None)
         _record_metadata_row(state, sample_id, self._json_or_none(sample_payload) or "{}")
-        rows: list[dict[str, object]] = []
         field_map = self.interleaved_field_map
         modality_field = field_map["modality"]
         text_field = field_map["text"]
         content_key_field = field_map["content_key"]
+        rows: list[dict[str, object]] = []
         for idx, segment in enumerate(segments):
             modality = _required_segment_str(segment, modality_field)
             if modality not in _SUPPORTED_INTERLEAVED_MODALITIES:
@@ -288,53 +285,31 @@ def _rows_from_interleaved_json(
                     "in WebDatasetReaderStage (supported: text, image)"
                 )
                 raise WebDatasetMemberParseError(msg)
-            if self._loads_modality(modality):
-                if modality == "text":
-                    rows.append(
-                        self._text_row(
-                            sid=sample_id,
-                            position=idx,
-                            source_shard=source.source_shard,
-                            content_type="text/plain",
-                            text_content=_required_segment_str(segment, text_field),
-                            element_metadata_json=self._json_or_none(segment),
-                        )
-                    )
-                else:
-                    rows.append(
-                        self._image_row(
-                            sid=sample_id,
-                            position=idx,
-                            source=source,
-                            content_key=_required_segment_str(segment, content_key_field),
-                            element_metadata_json=self._json_or_none(segment),
-                        )
+            if not self._loads_modality(modality):
+                continue
+            if modality == "text":
+                rows.append(
+                    self._text_row(
+                        sid=sample_id,
+                        position=idx,
+                        source_shard=source.source_shard,
+                        content_type="text/plain",
+                        text_content=_required_segment_str(segment, text_field),
+                        element_metadata_json=self._json_or_none(segment),
                     )
+                )
+                continue
+            rows.append(
+                self._image_row(
+                    sid=sample_id,
+                    position=idx,
+                    source=source,
+                    content_key=_required_segment_str(segment, content_key_field),
+                    element_metadata_json=self._json_or_none(segment),
+                )
+            )
         return rows
 
-    def _maybe_rows_from_interleaved_json_member(
-        self,
-        payload: bytes | None,
-        source: RowSource,
-        state: RowBuildState,
-        member_name: str,
-    ) -> list[dict[str, object]] | None:
-        if payload is None:
-            msg = f"JSON member '{member_name}' missing payload bytes"
-            raise WebDatasetMemberParseError(msg)
-        try:
-            parsed = self._rows_from_interleaved_json(payload, source, state)
-        except WebDatasetMemberParseError:
-            if self.sample_format == "interleaved":
-                raise
-            return None
-        except KeyError as err:
-            if self.sample_format == "interleaved":
-                msg = f"Interleaved JSON missing required field: {err}"
-                raise WebDatasetMemberParseError(msg) from err
-            return None
-        return parsed
-
     @staticmethod
     def _decode_text_payload(payload: bytes | None, member_name: str) -> str:
         if payload is None:
@@ -368,15 +343,7 @@ def _rows_from_binary_member(
         if not self._loads_modality(modality):
             return []
         sid, position = self._next_sample_and_position(state.sample_counters, member_name, modality)
-        return [
-            self._image_row(
-                sid=sid,
-                position=position,
-                source=source,
-                content_key=member_name,
-                binary_content=payload if self.load_binary else None,
-            )
-        ]
+        return [self._image_row(sid=sid, position=position, source=source, content_key=member_name, binary_content=payload if self.load_binary else None)]
 
     def _next_sample_and_position(
         self,