66from pathlib import Path
77from typing import Any , TypedDict
88
9- from datasets import ClassLabel , Sequence , concatenate_datasets , get_dataset_config_names , load_dataset
9+ from datasets import ClassLabel , Sequence , get_dataset_config_names , load_dataset
1010from datasets import Dataset as HFDataset
1111
1212from autointent .custom_types import LabelType , Split
@@ -39,7 +39,7 @@ class Dataset(dict[str, HFDataset]):
3939
4040 def __init__ (self , * args : Any , intents : list [Intent ], ** kwargs : Any ) -> None : # noqa: ANN401
4141 """
42- Initialize the dataset and configure OOS split if applicable .
42+ Initialize the dataset.
4343
4444 :param args: Positional arguments to initialize the dataset.
4545 :param intents: List of intents associated with the dataset.
@@ -54,10 +54,6 @@ def __init__(self, *args: Any, intents: list[Intent], **kwargs: Any) -> None: #
5454 if self .multilabel :
5555 self ._encode_labels ()
5656
57- oos_split = self ._create_oos_split ()
58- if oos_split is not None :
59- self [Split .OOS ] = oos_split
60-
6157 @property
6258 def multilabel (self ) -> bool :
6359 """
@@ -144,7 +140,10 @@ def to_json(self, filepath: str | Path) -> None:
144140
145141 :param filepath: The path to the file where the JSON data will be saved.
146142 """
147- with Path (filepath ).open ("w" ) as file :
143+ path = Path (filepath )
144+ if not path .parent .exists ():
145+ path .parent .mkdir (parents = True )
146+ with path .open ("w" ) as file :
148147 json .dump (self .to_dict (), file , indent = 4 , ensure_ascii = False )
149148
150149 def push_to_hub (self , repo_id : str , private : bool = False ) -> None :
@@ -204,15 +203,6 @@ def _encode_labels(self) -> "Dataset":
204203 self ._encoded_labels = True
205204 return self
206205
207- def _is_oos (self , sample : Sample ) -> bool :
208- """
209- Check if a sample is out-of-scope.
210-
211- :param sample: The sample to check.
212- :return: True if the sample is out-of-scope, False otherwise.
213- """
214- return sample ["label" ] is None
215-
216206 def _to_multilabel (self , sample : Sample ) -> Sample :
217207 """
218208 Convert a sample's label to multilabel format.
@@ -241,20 +231,6 @@ def _encode_label(self, sample: Sample) -> Sample:
241231 sample ["label" ] = one_hot_label
242232 return sample
243233
244- def _create_oos_split (self ) -> HFDataset | None :
245- """
246- Create an out-of-scope (OOS) split from the dataset.
247-
248- :return: The OOS split if created, None otherwise.
249- """
250- oos_splits = [split .filter (self ._is_oos ) for split in self .values ()]
251- oos_splits = [oos_split for oos_split in oos_splits if oos_split .num_rows ]
252- if oos_splits :
253- for split_name , split in self .items ():
254- self [split_name ] = split .filter (lambda sample : not self ._is_oos (sample ))
255- return concatenate_datasets (oos_splits )
256- return None
257-
258234 def _cast_label_feature (self ) -> None :
259235 """Cast the label feature of the dataset to the appropriate type."""
260236 for split_name , split in self .items ():
0 commit comments