
Commit 645e1fe

DownloadAndPrepareHuggingFaceDatasetJob, trust_remote_code, some doc ext (#604)
1 parent 3603013 commit 645e1fe

1 file changed (+22 -12 lines)

datasets/huggingface.py

Lines changed: 22 additions & 12 deletions
@@ -3,7 +3,7 @@
 """
 
 from typing import Optional, Any, Union
-from sisyphus import *
+from sisyphus import Job, Task, gs
 from sisyphus.delayed_ops import DelayedBase
 
 from i6_core.util import instanciate_delayed
@@ -24,7 +24,7 @@ class DownloadAndPrepareHuggingFaceDatasetJob(Job):
     https://github.com/huggingface/datasets/issues/4179
     """
 
-    __sis_hash_exclude__ = {"split": None, "token": None}
+    __sis_hash_exclude__ = {"split": None, "token": None, "trust_remote_code": None}
 
     def __init__(
         self,
@@ -35,22 +35,30 @@ def __init__(
         revision: Optional[str] = None,
         split: Optional[str] = None,
         token: Optional[Union[str, bool]] = None,
+        trust_remote_code: Optional[bool] = None,
         time_rqmt: float = 1,
         mem_rqmt: float = 2,
         cpu_rqmt: int = 2,
         mini_task: bool = True,
     ):
         """
-        :param path: Path or name of the dataset, parameter passed to `Dataset.load_dataset`
-        :param name: Name of the dataset configuration, parameter passed to `Dataset.load_dataset`
-        :param data_files: Path(s) to the source data file(s), parameter passed to `Dataset.load_dataset`
-        :param revision: Version of the dataset script, parameter passed to `Dataset.load_dataset`
-        :param split: Specifies the split to download e.g "test", parameter passed to `Dataset.load_dataset`
-        :param token: To use as Bearer token for remote files on the Datasets Hub, parameter passed to `Dataset.load_dataset`
-        :param float time_rqmt:
-        :param float mem_rqmt:
-        :param int cpu_rqmt:
-        :param bool mini_task: the job should be run as mini_task
+        :param path: Path or name of the dataset, parameter passed to :func:`load_dataset`
+        :param name: Name of the dataset configuration, parameter passed to :func:`load_dataset`
+        :param data_files: Path(s) to the source data file(s), parameter passed to :func:`load_dataset`
+        :param revision: Version of the dataset script, parameter passed to :func:`load_dataset`
+        :param split: Specifies the split to download e.g "test", parameter passed to :func:`load_dataset`
+        :param token: To use as Bearer token for remote files on the Datasets Hub, parameter passed to :func:`load_dataset`
+            If set to True, or if unset, it will use the standard HF methods to determine the token.
+            E.g. it will look for the HF_TOKEN env var,
+            or it will look into the HF home dir (set via HF_HOME env, or as default ~/.cache/huggingface).
+            Do ``python -m huggingface_hub.commands.huggingface_cli login``.
+            See HF :func:`get_token`.
+            You should *not* set some token in public recipes.
+        :param trust_remote_code: whether to trust remote code, parameter passed to :func:`load_dataset`
+        :param time_rqmt:
+        :param mem_rqmt:
+        :param cpu_rqmt:
+        :param mini_task: the job should be run as mini_task
         """
         super().__init__()
         self.path = path
@@ -59,6 +67,7 @@ def __init__(
         self.revision = revision
         self.split = split
         self.token = token
+        self.trust_remote_code = trust_remote_code
 
         self.rqmt = {"cpu": cpu_rqmt, "mem": mem_rqmt, "time": time_rqmt}
         self.mini_task = mini_task
@@ -81,6 +90,7 @@ def run(self):
                 cache_dir=tmp_dir,
                 split=self.split,
                 token=self.token,
+                **({"trust_remote_code": self.trust_remote_code} if self.trust_remote_code is not None else {}),
             )
             print("Dataset:")
             print(ds)
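
For context, a minimal usage sketch (not part of this commit) showing how the job might be constructed in a Sisyphus recipe with the new trust_remote_code flag. The constructor arguments follow the diff above; the dataset name and configuration are purely illustrative, and token=True (or leaving token unset) defers to the standard HF token resolution (HF_TOKEN env var or a cached huggingface-cli login).

from i6_core.datasets.huggingface import DownloadAndPrepareHuggingFaceDatasetJob

# Illustrative example: a gated dataset that ships a loading script, so it needs
# both a token and trust_remote_code=True; not taken from this commit.
dl_job = DownloadAndPrepareHuggingFaceDatasetJob(
    path="mozilla-foundation/common_voice_11_0",  # example dataset, assumption
    name="en",
    split="test",
    token=True,  # resolved via HF_TOKEN or the cached login token, see docstring above
    trust_remote_code=True,
)

Leaving trust_remote_code at its default of None keeps it out of the job hash (see the __sis_hash_exclude__ change above), so existing setups keep their old hashes.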
