scaffold

andreatgretel · andreatgretel · commit b15e2ba552a1 · 2025-11-11T19:14:32.000-03:00
diff --git a/examples/.gitignore b/examples/.gitignore
@@ -0,0 +1 @@
+artifacts
diff --git a/examples/example.py b/examples/example.py
@@ -0,0 +1,217 @@
+import os
+
+from data_designer.essentials import (
+    DataDesignerConfigBuilder,
+    ModelConfig,
+    InferenceParameters,
+    SamplerColumnConfig,
+    CategorySamplerParams,
+    SubcategorySamplerParams,
+    PersonSamplerParams,
+    LLMTextColumnConfig,
+    Score,
+    DataDesigner,
+    ToJsonlProcessorConfig,
+)
+
+# Define model aliases.
+# The alias is the handle the LLM column below passes as `model_alias` to select this model config.
+model_alias_generator = "content_generator"
+model_configs = [
+    ModelConfig(
+        alias=model_alias_generator,
+        provider="nvidia",
+        model="deepseek-ai/deepseek-r1-distill-qwen-14b",
+        inference_parameters=InferenceParameters(
+            max_tokens=8000,
+            temperature=0.7,
+            top_p=0.95,
+        ),
+    )
+]
+
+# Builder that receives every column and processor definition added in the rest of this script.
+config_builder = DataDesignerConfigBuilder(model_configs=model_configs)
+
+# ESI (Emergency Severity Index) levels.
+# These strings are used as the category values, as the keys of the scenario mapping,
+# and are listed verbatim in the system message of the JSONL template further down.
+ESI_LEVELS = [
+    "ESI 1: Resuscitation",
+    "ESI 2: Emergency",
+    "ESI 3: Urgent",
+    "ESI 4: Less Urgent",
+    "ESI 5: Non-urgent",
+]
+
+# Unique record ID
+# NOTE: this column uses the keyword-argument form of `add_column`; the remaining
+# columns pass a ready-made column config object instead.
+config_builder.add_column(
+    name="record_id",
+    column_type="sampler",
+    sampler_type="uuid",
+    params={"short_form": True, "uppercase": True}
+)
+
+# ESI level (balanced sampling)
+# Only `values` is given (no weights), so every entry of ESI_LEVELS is a candidate value.
+config_builder.add_column(
+    SamplerColumnConfig(
+        name="esi_level_description",
+        sampler_type="category",
+        params=CategorySamplerParams(
+            values=ESI_LEVELS,
+        ),
+    )
+)
+
+# Clinical scenario (conditioned on ESI level)
+# `category` names the parent column; `values` maps each ESI_LEVELS entry to the
+# candidate scenarios for that level.
+config_builder.add_column(
+    SamplerColumnConfig(
+        name="clinical_scenario",
+        sampler_type="subcategory",
+        params=SubcategorySamplerParams(
+            category="esi_level_description",
+            values={
+                ESI_LEVELS[0]: [
+                    "Cardiac arrest",
+                    "Unresponsive with no pulse",
+                    "Severe respiratory distress",
+                    "Major trauma with signs of shock",
+                    "Suspected narcotic overdose with shallow respirations",
+                ],
+                ESI_LEVELS[1]: [
+                    "Crushing substernal chest pain radiating to the left arm",
+                    "Sudden onset of facial droop and arm weakness",
+                    "New onset confusion in an elderly patient",
+                    "Active suicidal ideation with a plan",
+                    "High-speed motor vehicle accident",
+                    "Severe abdominal pain in a patient with a history of aortic aneurysm",
+                ],
+                ESI_LEVELS[2]: [
+                    "Abdominal pain with fever and nausea",
+                    "High fever with a productive cough and history of COPD",
+                    "Displaced fracture with visible deformity",
+                    "Asthma attack, responsive to initial treatment",
+                    "Vaginal bleeding in a pregnant patient",
+                    "Head injury with brief loss of consciousness",
+                ],
+                ESI_LEVELS[3]: [
+                    "Simple laceration requiring sutures",
+                    "Twisted ankle, unable to bear weight",
+                    "Sore throat with fever",
+                    "Symptoms of a urinary tract infection",
+                    "Painful ear with fever in a child",
+                ],
+                ESI_LEVELS[4]: [
+                    "Request for a prescription refill",
+                    "Suture removal",
+                    "Minor rash present for several days",
+                    "Common cold symptoms",
+                    "Follow-up for a minor wound check",
+                ],
+            },
+        ),
+    )
+)
+
+# Synthetic patient info
+# The prompt of the `content` column below reads `patient.age` and `patient.sex` from this column.
+config_builder.add_column(
+    SamplerColumnConfig(
+        name="patient",
+        sampler_type="person",
+        params=PersonSamplerParams(age_range=[18, 70]),
+    )
+)
+
+# Triage note writing style (captures range from poor to best quality notes)
+# The three values are the same labels the `content` prompt explains one by one.
+config_builder.add_column(
+    SamplerColumnConfig(
+        name="writing_style",
+        sampler_type="category",
+        params=CategorySamplerParams(
+            values=["Draft", "Adequate", "Polished"]
+        ),
+    )
+)
+
+# LLM-generated triage note
+# The {{ ... }} placeholders refer to the columns defined above
+# (patient, esi_level_description, clinical_scenario, writing_style).
+# NOTE(review): the opening sentences always ask for a "draft note" in "telegraphic style",
+# while the 'Adequate' bullet asks for complete sentences; confirm this mix is intended.
+config_builder.add_column(
+    LLMTextColumnConfig(
+        name="content",
+        prompt=(
+            "You are an experienced triage nurse in a busy Emergency Department writing a draft note. "
+            "Write a realistic, concise triage note in a telegraphic style using common medical abbreviations. "
+            "The note is for a {{ patient.age }} y/o {{ 'M' if patient.sex == 'Male' else 'F' }}. "
+            "Triage classification: '{{ esi_level_description }}'. "
+            "Reason for visit: '{{ clinical_scenario }}'. "
+            "Desired writing style: '{{ writing_style }}'. "
+            "Structure the note with 'CC:' and 'HPI:'. "
+            "Adjust the style and level of clinical detail based on the 'writing_style': "
+            "- Draft: Use minimal structure, brief statements, and omit some details; clinical indicators may be less clear. "
+            "- Adequate: Use complete sentences, include all relevant clinical indicators, but avoid excessive detail. "
+            "- Polished: Be thorough, precise, and clear; include nuanced or subtle signs and show strong clinical reasoning. "
+            "Also, adjust level of detail based on urgency (ESI 1 is always brief). "
+            "Respond with ONLY the note text, starting with 'CC:'."
+        ),
+        model_alias=model_alias_generator,
+    )
+)
+
+# Rubric: clinical coherence
+# NOTE(review): neither rubric defined in this section is referenced anywhere else in
+# this script (no column or processor consumes them); confirm whether a scoring
+# column was meant to be added.
+clinical_coherence_rubric = Score(
+    name="Clinical Coherence",
+    description="Evaluates how well the clinical details in the triage note align with the assigned ESI level and scenario.",
+    options={
+        "5": "Note is perfectly aligned with the ESI level and scenario; details are clinically plausible and specific.",
+        "4": "Note is well-aligned, with only minor details that might be slightly inconsistent.",
+        "3": "Note is generally consistent, but some key clinical indicators are missing or don't fully match the ESI level.",
+        "2": "Note shows significant inconsistency between the clinical details and the assigned ESI level.",
+        "1": "Note is clinically incoherent and does not reflect the assigned ESI level or scenario at all."
+    }
+)
+
+# Rubric: ESI level complexity (reduced to 3 levels: Simple, Moderate, Complex)
+# Unlike the numeric options above, the option keys here are labels.
+esi_level_complexity_rubric = Score(
+    name="ESI Level Complexity",
+    description="Evaluates how difficult it is to infer the correct ESI level from the note. Higher scores indicate greater complexity, which is desirable for creating a challenging dataset.",
+    options={
+        "Complex": "Note contains subtle or conflicting information, requiring clinical reasoning to distinguish between ESI levels.",
+        "Moderate": "Note requires some clinical inference; indicators are present but not always immediately obvious.",
+        "Simple": "Note uses clear, direct, or textbook indicators that make the ESI level obvious."
+    }
+)
+
+# Template for one JSONL entry: a three-message chat (system, user, assistant).
+# The {{ content }} and {{ esi_level_description }} placeholders are left unrendered here
+# and filled per record later; the f-string in the system message is evaluated right
+# away and lists the ESI_LEVELS values.
+# NOTE(review): the system message asks for "only the selected ESI level description"
+# while the user message asks for JSON and the assistant message answers in JSON;
+# confirm which answer format is intended.
+jsonl_entry_template = {
+    "messages": [
+        {
+            "role": "system",
+            "content": (
+                "You are an expert ER triage nurse. Your task is to classify the following triage note into one of the five Emergency Severity Index (ESI) levels."
+                f" The possible levels are: {', '.join([repr(level) for level in ESI_LEVELS])}."
+                " Carefully analyze the clinical details in the triage note, focusing on patient acuity, resource needs, and risk of rapid deterioration."
+                " Respond with only the selected ESI level description, exactly matching one of the listed possibilities. Do not provide extra text or explanation."
+            )
+        },
+        {
+            "role": "user",
+            "content": (
+                "Triage Note: {{ content }}\n"
+                "Classify the ESI level for this note based on the provided definitions."
+                " Respond in JSON format only: { \"esi_level_description\": \"...\" }"
+            )
+        },
+        {
+            "role": "assistant",
+            "content": (
+                '{ "esi_level_description": "{{ esi_level_description }}" }'
+            )
+        },
+    ],
+}
+
+# Register the JSONL export: each record is formatted with the template above and the
+# result is split across the listed files inside `folder_name`.
+# The fractions are the share of the dataset written to each file.
+config_builder.add_processor(
+    ToJsonlProcessorConfig(
+        template=jsonl_entry_template,
+        folder_name="jsonl_files",
+        fraction_per_file={
+            "train.jsonl": 0.8,
+            "validation.jsonl": 0.2,
+        },
+    )
+)
+
+# Blob storage location handed to DataDesigner. Set the DATA_DESIGNER_BLOB_STORAGE_PATH
+# environment variable to point at your own copy; the fallback is the original
+# author's local path and will not exist on other machines.
+blob_storage_path = os.environ.get(
+    "DATA_DESIGNER_BLOB_STORAGE_PATH",
+    "/Users/amanoel/Data/nemotron-personas-datasets_v0.0.6",
+)
+
+dd = DataDesigner(artifact_path="./artifacts", blob_storage_path=blob_storage_path)
+# Build a 10-record preview from the configuration assembled above.
+preview = dd.preview(config_builder, num_records=10)
diff --git a/src/data_designer/config/processors.py b/src/data_designer/config/processors.py
@@ -15,11 +15,13 @@
 
 class ProcessorType(str, Enum):
+    """Identifiers of the available processors (one member per processor config class)."""
+
     DROP_COLUMNS = "drop_columns"
+    TO_JSONL = "to_jsonl"
 
 
 class ProcessorConfig(ConfigBase, ABC):
     build_stage: BuildStage = Field(
-        ..., description=f"The stage at which the processor will run. Supported stages: {', '.join(SUPPORTED_STAGES)}"
+        default=BuildStage.POST_BATCH,
+        description=f"The stage at which the processor will run. Supported stages: {', '.join(SUPPORTED_STAGES)}"
     )
 
     @field_validator("build_stage")
@@ -34,8 +36,26 @@ def validate_build_stage(cls, v: BuildStage) -> BuildStage:
 def get_processor_config_from_kwargs(processor_type: ProcessorType, **kwargs) -> ProcessorConfig:
+    """Build the config object matching ``processor_type``, passing ``kwargs`` to its constructor.
+
+    NOTE(review): a ``processor_type`` that matches neither branch falls through and
+    returns ``None`` implicitly, despite the ``ProcessorConfig`` return annotation.
+    """
     if processor_type == ProcessorType.DROP_COLUMNS:
         return DropColumnsProcessorConfig(**kwargs)
+    elif processor_type == ProcessorType.TO_JSONL:
+        return ToJsonlProcessorConfig(**kwargs)
 
 
 class DropColumnsProcessorConfig(ProcessorConfig):
+    """Configuration for the ``drop_columns`` processor.
+
+    ``column_names`` lists the columns the processor acts on (presumably the ones
+    to drop; the processor itself is not shown here).
+    """
+
     column_names: list[str]
     processor_type: Literal[ProcessorType.DROP_COLUMNS] = ProcessorType.DROP_COLUMNS
+
+
+class ToJsonlProcessorConfig(ProcessorConfig):
+    """Configuration for the ``to_jsonl`` processor.
+
+    ``template`` is the per-entry template, ``folder_name`` the folder the JSONL files
+    are saved in, and ``fraction_per_file`` maps each output filename to the fraction
+    of the dataset it receives.
+    """
+
+    template: dict = Field(..., description="The template to use for each entry in the dataset.")
+    folder_name: str = Field(..., description="Folder where JSONL files will be saved.")
+    fraction_per_file: dict[str, float] = Field(
+        default={"train.jsonl": 0.8, "validation.jsonl": 0.2},
+        description="Fraction of the dataset to save in each file. The keys are the filenames and the values are the fractions.",
+    )
+    processor_type: Literal[ProcessorType.TO_JSONL] = ProcessorType.TO_JSONL
+
+    @field_validator("fraction_per_file")
+    def validate_fraction_per_file(cls, v: dict[str, float]) -> dict[str, float]:
+        """Check that the fractions are non-negative and add up to 1.
+
+        Args:
+            v: Mapping of filename to dataset fraction.
+
+        Returns:
+            The mapping, unchanged.
+
+        Raises:
+            ValueError: If any fraction is negative or the fractions do not sum to 1.
+        """
+        if any(fraction < 0 for fraction in v.values()):
+            raise ValueError("The fractions must be non-negative.")
+        # Compare with a tolerance: exact float equality rejects valid splits such as
+        # 0.7 + 0.2 + 0.1, which adds up to 0.9999999999999999. The condition is written
+        # as "not (<=)" so that a NaN total is still rejected.
+        if not abs(sum(v.values()) - 1.0) <= 1e-9:
+            raise ValueError("The fractions must sum to 1.")
+        return v
diff --git a/src/data_designer/engine/dataset_builders/artifact_storage.py b/src/data_designer/engine/dataset_builders/artifact_storage.py
@@ -32,6 +32,7 @@ class ArtifactStorage(BaseModel):
     final_dataset_folder_name: str = "parquet-files"
     partial_results_folder_name: str = "tmp-partial-parquet-files"
     dropped_columns_folder_name: str = "dropped-columns-parquet-files"
+    outputs_folder_name: str = "outputs"
 
     @property
     def artifact_path_exists(self) -> bool:
@@ -57,6 +58,10 @@ def metadata_file_path(self) -> Path:
     def partial_results_path(self) -> Path:
         return self.base_dataset_path / self.partial_results_folder_name
 
+    @property
+    def outputs_path(self) -> Path:
+        """Path of the outputs folder: ``base_dataset_path / outputs_folder_name``."""
+        return self.base_dataset_path / self.outputs_folder_name
+
     @field_validator("artifact_path")
     def validate_artifact_path(cls, v: Union[Path, str]) -> Path:
         v = Path(v)
@@ -178,5 +183,10 @@ def write_metadata(self, metadata: dict) -> Path:
             json.dump(metadata, file)
         return self.metadata_file_path
 
+    def move_to_outputs(self, from_path: Path, to_folder_name: str) -> Path:
+        """Move ``from_path`` into ``outputs_path / to_folder_name``, keeping its file name.
+
+        The target folder is passed to ``mkdir_if_needed`` before the move.
+
+        NOTE(review): nothing here checks for an existing file at the destination;
+        what happens then is left to ``shutil.move``.
+
+        Returns:
+            The path of the moved file inside the outputs folder.
+        """
+        target_folder = self.outputs_path / to_folder_name
+        self.mkdir_if_needed(target_folder)
+        destination = target_folder / from_path.name
+        shutil.move(from_path, destination)
+        return destination
+
     def _get_stage_path(self, stage: BatchStage) -> Path:
+        """Return the attribute of this object whose name is the value of the resolved ``stage``."""
         return getattr(self, resolve_string_enum(stage, BatchStage).value)
diff --git a/src/data_designer/engine/processing/processors/registry.py b/src/data_designer/engine/processing/processors/registry.py
@@ -5,9 +5,11 @@
 from data_designer.config.processors import (
     DropColumnsProcessorConfig,
     ProcessorType,
+    ToJsonlProcessorConfig,
 )
 from data_designer.engine.processing.processors.base import Processor
 from data_designer.engine.processing.processors.drop_columns import DropColumnsProcessor
+from data_designer.engine.processing.processors.to_jsonl import ToJsonlProcessor
 from data_designer.engine.registry.base import TaskRegistry
 
 
@@ -17,4 +19,5 @@ class ProcessorRegistry(TaskRegistry[str, Processor, ConfigBase]): ...
 def create_default_processor_registry() -> ProcessorRegistry:
+    """Create a ``ProcessorRegistry`` with the built-in processors registered.
+
+    Each ``register`` call pairs a ``ProcessorType`` with its processor class and
+    its config class.
+    """
     registry = ProcessorRegistry()
     registry.register(ProcessorType.DROP_COLUMNS, DropColumnsProcessor, DropColumnsProcessorConfig, False)
+    registry.register(ProcessorType.TO_JSONL, ToJsonlProcessor, ToJsonlProcessorConfig, False)
     return registry
diff --git a/src/data_designer/engine/processing/processors/to_jsonl.py b/src/data_designer/engine/processing/processors/to_jsonl.py
@@ -0,0 +1,65 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import json
+import logging
+import tempfile
+
+import pandas as pd
+from pathlib import Path
+
+from data_designer.config.processors import ToJsonlProcessorConfig
+from data_designer.engine.configurable_task import ConfigurableTaskMetadata
+from data_designer.engine.processing.ginja.environment import WithJinja2UserTemplateRendering
+from data_designer.engine.processing.processors.base import Processor
+from data_designer.engine.processing.utils import deserialize_json_values
+
+logger = logging.getLogger(__name__)
+
+
+class ToJsonlProcessor(WithJinja2UserTemplateRendering, Processor[ToJsonlProcessorConfig]):
+    """Processor that renders each record with the configured template and saves JSONL files."""
+
+    @staticmethod
+    def metadata() -> ConfigurableTaskMetadata:
+        """Return the name and description under which this processor is described."""
+        return ConfigurableTaskMetadata(
+            name="to_jsonl",
+            description="Save formatted dataset as JSONL files.",
+            required_resources=None,
+        )
+
+    @property
+    def template_as_string(self) -> str:
+        """The configured template dict serialized to a JSON string."""
+        return json.dumps(self.config.template)
+
+    def _get_stop_index_per_file(self, dataset_size: int) -> dict[str, int]:
+        """Helper function to get the end index for each file of the split.
+
+        Args:
+            dataset_size: Number of records in the dataset being split.
+
+        Returns:
+            Mapping of filename to the exclusive end index of its slice, in the order
+            of ``fraction_per_file``. The last file always ends at ``dataset_size``.
+        """
+        stop_index_per_file = {}
+
+        accumulated_fraction = 0.0
+        for filename, fraction in self.config.fraction_per_file.items():
+            accumulated_fraction += fraction
+            # The small epsilon absorbs float accumulation error before truncating:
+            # e.g. 0.1 + 0.7 is 0.7999999999999999, which would otherwise truncate
+            # 7.999999999999999 down to 7 for a dataset of 10 records.
+            stop_index = int(accumulated_fraction * dataset_size + 1e-9)
+            stop_index_per_file[filename] = min(stop_index, dataset_size)
+
+        if stop_index_per_file:
+            # The config validates that the fractions sum to 1, so the final file must
+            # reach the end of the dataset; pinning it guarantees no record is dropped.
+            last_filename = next(reversed(stop_index_per_file))
+            stop_index_per_file[last_filename] = dataset_size
+
+        return stop_index_per_file
+
+    def process(self, data: pd.DataFrame, *, current_batch_number: int | None = None) -> pd.DataFrame:
+        """Render every record of ``data`` and save the result as JSONL files.
+
+        Records are split by row position according to ``fraction_per_file``. Each file
+        is written to a temporary directory and then moved to the artifact storage
+        outputs under ``folder_name``.
+
+        NOTE(review): ``current_batch_number`` is not used, so every call targets the
+        same output file names; confirm this is intended if the processor runs once
+        per batch.
+
+        Args:
+            data: Dataset to format; its column names are made available to the template.
+            current_batch_number: Unused.
+
+        Returns:
+            ``data``, unchanged.
+        """
+        self.prepare_jinja2_template_renderer(self.template_as_string, data.columns.to_list())
+
+        stop_index_per_file = self._get_stop_index_per_file(len(data))
+        with tempfile.TemporaryDirectory() as temp_dir:
+            start_index = 0
+            for filename, stop_index in stop_index_per_file.items():
+                logger.info(f"✏️ Writing {stop_index - start_index} formatted JSONL entries to {filename}")
+
+                records = data.iloc[start_index:stop_index].to_dict(orient="records")
+                # NOTE(review): only newlines are escaped after rendering. Unless the
+                # renderer escapes them itself, values containing double quotes or
+                # backslashes would yield invalid JSON lines; confirm.
+                jsonl_lines = [
+                    self.render_template(deserialize_json_values(record)).replace("\n", "\\n")
+                    for record in records
+                ]
+
+                temp_file_path = Path(temp_dir) / filename
+                # Explicit UTF-8 so the output does not depend on the platform's default encoding.
+                with open(temp_file_path, "w", encoding="utf-8") as f:
+                    # Entries are newline-separated, with no newline after the last one.
+                    f.write("\n".join(jsonl_lines))
+                start_index = stop_index
+
+                self.artifact_storage.move_to_outputs(temp_file_path, self.config.folder_name)
+
+        return data
diff --git a/src/data_designer/essentials/__init__.py b/src/data_designer/essentials/__init__.py
@@ -34,7 +34,7 @@
     UniformDistribution,
     UniformDistributionParams,
 )
-from ..config.processors import DropColumnsProcessorConfig, ProcessorType
+from ..config.processors import DropColumnsProcessorConfig, ProcessorType, ToJsonlProcessorConfig
 from ..config.sampler_constraints import ColumnInequalityConstraint, ScalarInequalityConstraint
 from ..config.sampler_params import (
     BernoulliMixtureSamplerParams,
@@ -124,6 +124,7 @@
     "SeedDatasetColumnConfig",
     "SubcategorySamplerParams",
     "TimeDeltaSamplerParams",
+    "ToJsonlProcessorConfig",
     "UniformDistribution",
     "UniformDistributionParams",
     "UniformSamplerParams",