Skip to content

Commit 7258505

Browse files
authored
use default dataprocessor if one is not provided (#590)
Signed-off-by: Dushyant Behl <dushyantbehl@in.ibm.com>
1 parent 048a0e8 commit 7258505

File tree

2 files changed

+10
-2
lines changed

2 files changed

+10
-2
lines changed

tests/artifacts/predefined_data_configs/pretokenized_data.yaml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
dataprocessor:
2-
type: default
31
datasets:
42
- name: pretokenized_dataset
53
data_paths:

tuning/data/data_config.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,11 +188,21 @@ def load_and_validate_data_config(data_config_file: str) -> DataConfig:
188188
assert isinstance(
189189
raw_data["datasets"], list
190190
), "datasets should be provided as a list"
191+
191192
datasets = []
193+
dataprocessor = None
194+
192195
for d in raw_data["datasets"]:
193196
datasets.append(_validate_dataset_config(d))
194197
if "dataprocessor" in raw_data:
195198
dataprocessor = _validate_dataprocessor_config(raw_data["dataprocessor"])
196199

200+
if dataprocessor is None:
201+
logging.info(
202+
"`dataprocessor` field is absent from data config. Using default dataprocessor"
203+
)
204+
dataprocessor = DataPreProcessorConfig()
205+
logging.info("Default datapreprocessor is %s", str(dataprocessor))
206+
197207
data_config = DataConfig(dataprocessor=dataprocessor, datasets=datasets)
198208
return data_config

0 commit comments

Comments
 (0)