Skip to content

Commit 6d85bf1

Browse files
committed
Elevate another global variable to the RunConfig
1 parent 93f71f8 commit 6d85bf1

File tree

7 files changed

+18
-16
lines changed

7 files changed

+18
-16
lines changed

docs/code_reference/run_config.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# Run Config
2+
3+
The `run_config` module defines runtime settings that control dataset generation behavior,
4+
including early shutdown thresholds, batch sizing, and non-LLM worker concurrency.
5+
6+
::: data_designer.config.run_config

mkdocs.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ nav:
4343
- column_configs: code_reference/column_configs.md
4444
- config_builder: code_reference/config_builder.md
4545
- data_designer_config: code_reference/data_designer_config.md
46+
- run_config: code_reference/run_config.md
4647
- sampler_params: code_reference/sampler_params.md
4748
- validator_params: code_reference/validator_params.md
4849
- processors: code_reference/processors.md

src/data_designer/config/run_config.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ class RunConfig(ConfigBase):
2626
buffer_size: Number of records to process in each batch during dataset generation.
2727
A batch is processed end-to-end (column generation, post-batch processors, and writing the batch
2828
to artifact storage) before moving on to the next batch. Must be > 0. Default is 1000.
29+
non_llm_max_parallel_workers: Maximum number of worker threads used for non-LLM
30+
cell-by-cell generators. Must be >= 1. Default is 4.
2931
max_conversation_restarts: Maximum number of full conversation restarts permitted when
3032
generation tasks call `ModelFacade.generate(...)`. Must be >= 0. Default is 5.
3133
max_conversation_correction_steps: Maximum number of correction rounds permitted within a
@@ -37,6 +39,7 @@ class RunConfig(ConfigBase):
3739
shutdown_error_rate: float = Field(default=0.5, ge=0.0, le=1.0)
3840
shutdown_error_window: int = Field(default=10, ge=0)
3941
buffer_size: int = Field(default=1000, gt=0)
42+
non_llm_max_parallel_workers: int = Field(default=4, ge=1)
4043
max_conversation_restarts: int = Field(default=5, ge=0)
4144
max_conversation_correction_steps: int = Field(default=0, ge=0)
4245

src/data_designer/engine/dataset_builders/column_wise_builder.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,7 @@
3131
from data_designer.engine.dataset_builders.artifact_storage import SDG_CONFIG_FILENAME, ArtifactStorage
3232
from data_designer.engine.dataset_builders.errors import DatasetGenerationError, DatasetProcessingError
3333
from data_designer.engine.dataset_builders.multi_column_configs import MultiColumnConfig
34-
from data_designer.engine.dataset_builders.utils.concurrency import (
35-
MAX_CONCURRENCY_PER_NON_LLM_GENERATOR,
36-
ConcurrentThreadExecutor,
37-
)
34+
from data_designer.engine.dataset_builders.utils.concurrency import ConcurrentThreadExecutor
3835
from data_designer.engine.dataset_builders.utils.config_compiler import compile_dataset_builder_column_configs
3936
from data_designer.engine.dataset_builders.utils.dataset_batch_manager import DatasetBatchManager
4037
from data_designer.engine.models.telemetry import InferenceEvent, NemoSourceEnum, TaskStatusEnum, TelemetryHandler
@@ -202,7 +199,7 @@ def _run_from_scratch_column_generator(self, generator: ColumnGenerator) -> None
202199
self.batch_manager.add_records(df.to_dict(orient="records"))
203200

204201
def _run_cell_by_cell_generator(self, generator: ColumnGenerator) -> None:
205-
max_workers = MAX_CONCURRENCY_PER_NON_LLM_GENERATOR
202+
max_workers = self._resource_provider.run_config.non_llm_max_parallel_workers
206203
if isinstance(generator, ColumnGeneratorWithModel):
207204
max_workers = generator.inference_parameters.max_parallel_requests
208205
self._fan_out_with_threads(generator, max_workers=max_workers)

src/data_designer/engine/dataset_builders/utils/concurrency.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,6 @@
1616

1717
logger = logging.getLogger(__name__)
1818

19-
# Constants
20-
MAX_CONCURRENCY_PER_NON_LLM_GENERATOR = 4
21-
22-
2319
class ExecutorResults(BaseModel):
2420
failure_threshold: float = 0.0 # Error rate threshold
2521
completed_count: int = 0 # How many tasks/jobs completed

src/data_designer/interface/data_designer.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -316,7 +316,8 @@ def set_run_config(self, run_config: RunConfig) -> None:
316316
317317
Args:
318318
run_config: A RunConfig instance containing runtime settings such as
319-
early shutdown behavior and batch sizing via `buffer_size`. Import RunConfig from
319+
early shutdown behavior, batch sizing via `buffer_size`, and non-LLM worker
320+
concurrency via `non_llm_max_parallel_workers`. Import RunConfig from
320321
data_designer.essentials.
321322
322323
Example:

tests/engine/dataset_builders/test_column_wise_builder.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,7 @@
1515
from data_designer.config.run_config import RunConfig
1616
from data_designer.config.sampler_params import SamplerType, UUIDSamplerParams
1717
from data_designer.engine.column_generators.generators.base import GenerationStrategy
18-
from data_designer.engine.dataset_builders.column_wise_builder import (
19-
MAX_CONCURRENCY_PER_NON_LLM_GENERATOR,
20-
ColumnWiseDatasetBuilder,
21-
)
18+
from data_designer.engine.dataset_builders.column_wise_builder import ColumnWiseDatasetBuilder
2219
from data_designer.engine.dataset_builders.errors import DatasetGenerationError
2320
from data_designer.engine.models.telemetry import InferenceEvent, NemoSourceEnum, TaskStatusEnum
2421
from data_designer.engine.models.usage import ModelUsageStats, TokenUsageStats
@@ -243,8 +240,9 @@ def test_column_wise_dataset_builder_initialize_processors(stub_column_wise_buil
243240
assert processors[BuildStage.POST_BATCH][0].config.column_names == ["column_to_drop"]
244241

245242

246-
def test_constants_max_concurrency_constant():
247-
assert MAX_CONCURRENCY_PER_NON_LLM_GENERATOR == 4
243+
def test_run_config_default_non_llm_max_parallel_workers() -> None:
244+
run_config = RunConfig()
245+
assert run_config.non_llm_max_parallel_workers == 4
248246

249247

250248
@patch("data_designer.engine.dataset_builders.column_wise_builder.TelemetryHandler")

0 commit comments

Comments
 (0)