33"""
44
55from typing import Optional , Any , Union
6- from sisyphus import *
6+ from sisyphus import Job , Task , gs
77from sisyphus .delayed_ops import DelayedBase
88
99from i6_core .util import instanciate_delayed
@@ -24,7 +24,7 @@ class DownloadAndPrepareHuggingFaceDatasetJob(Job):
2424 https://github.com/huggingface/datasets/issues/4179
2525 """
2626
27- __sis_hash_exclude__ = {"split" : None , "token" : None }
27+ __sis_hash_exclude__ = {"split" : None , "token" : None , "trust_remote_code" : None }
2828
2929 def __init__ (
3030 self ,
@@ -35,22 +35,30 @@ def __init__(
3535 revision : Optional [str ] = None ,
3636 split : Optional [str ] = None ,
3737 token : Optional [Union [str , bool ]] = None ,
38+ trust_remote_code : Optional [bool ] = None ,
3839 time_rqmt : float = 1 ,
3940 mem_rqmt : float = 2 ,
4041 cpu_rqmt : int = 2 ,
4142 mini_task : bool = True ,
4243 ):
4344 """
44- :param path: Path or name of the dataset, parameter passed to `Dataset.load_dataset`
45- :param name: Name of the dataset configuration, parameter passed to `Dataset.load_dataset`
46- :param data_files: Path(s) to the source data file(s), parameter passed to `Dataset.load_dataset`
47- :param revision: Version of the dataset script, parameter passed to `Dataset.load_dataset`
48- :param split: Specifies the split to download e.g "test", parameter passed to `Dataset.load_dataset`
49- :param token: To use as Bearer token for remote files on the Datasets Hub, parameter passed to `Dataset.load_dataset`
50- :param float time_rqmt:
51- :param float mem_rqmt:
52- :param int cpu_rqmt:
53- :param bool mini_task: the job should be run as mini_task
45+ :param path: Path or name of the dataset, parameter passed to :func:`load_dataset`
46+ :param name: Name of the dataset configuration, parameter passed to :func:`load_dataset`
47+ :param data_files: Path(s) to the source data file(s), parameter passed to :func:`load_dataset`
48+ :param revision: Version of the dataset script, parameter passed to :func:`load_dataset`
49+ :param split: Specifies the split to download, e.g. "test", parameter passed to :func:`load_dataset`
50+ :param token: To use as Bearer token for remote files on the Datasets Hub, parameter passed to :func:`load_dataset`
51+ If set to True, or if unset, it will use the standard HF methods to determine the token.
52+ E.g. it will look for the HF_TOKEN env var,
53+ or it will look into the HF home dir (set via HF_HOME env, or as default ~/.cache/huggingface).
54+ To log in, run ``python -m huggingface_hub.commands.huggingface_cli login``.
55+ See HF :func:`get_token`.
56+ You should *not* hard-code a token in public recipes.
57+ :param trust_remote_code: whether to trust remote code, parameter passed to :func:`load_dataset`
58+ :param time_rqmt:
59+ :param mem_rqmt:
60+ :param cpu_rqmt:
61+ :param mini_task: the job should be run as mini_task
5462 """
5563 super ().__init__ ()
5664 self .path = path
@@ -59,6 +67,7 @@ def __init__(
5967 self .revision = revision
6068 self .split = split
6169 self .token = token
70+ self .trust_remote_code = trust_remote_code
6271
6372 self .rqmt = {"cpu" : cpu_rqmt , "mem" : mem_rqmt , "time" : time_rqmt }
6473 self .mini_task = mini_task
@@ -81,6 +90,7 @@ def run(self):
8190 cache_dir = tmp_dir ,
8291 split = self .split ,
8392 token = self .token ,
93+ ** ({"trust_remote_code" : self .trust_remote_code } if self .trust_remote_code is not None else {}),
8494 )
8595 print ("Dataset:" )
8696 print (ds )
0 commit comments