-
Notifications
You must be signed in to change notification settings - Fork 467
Expand file tree
/
Copy pathc4.py
More file actions
26 lines (19 loc) · 909 Bytes
/
c4.py
File metadata and controls
26 lines (19 loc) · 909 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
from copy import deepcopy
from typing import TYPE_CHECKING
from llmcompressor.transformers.finetune.data import TextGenerationDataset
from llmcompressor.typing import Processor
if TYPE_CHECKING:
from llmcompressor.arg_parser import DatasetArguments
@TextGenerationDataset.register(name="c4")
class C4Dataset(TextGenerationDataset):
    """
    Text generation dataset specialised for the C4 dataset

    Registered with ``TextGenerationDataset`` under the name ``c4``. The
    constructor pins the dataset identifier and the text column, then hands
    everything to the parent class.

    :param data_args: configuration settings for dataset loading
    :param split: split from dataset to load, for instance `test` or `train[:5%]`
    :param processor: processor or tokenizer to use on dataset
    """

    def __init__(
        self,
        data_args: "DatasetArguments",
        split: str,
        processor: Processor,
    ):
        """
        Build the dataset from a private, C4-specific copy of ``data_args``

        :param data_args: configuration settings for dataset loading; the
            object passed in is deep-copied and is not modified
        :param split: split from dataset to load
        :param processor: processor or tokenizer to use on dataset
        """
        # Work on a deep copy so the caller's arguments object stays untouched.
        c4_args = deepcopy(data_args)

        # Settings that are fixed for C4, regardless of what the caller supplied.
        fixed_settings = (
            ("dataset", "allenai/c4"),
            ("text_column", "text"),
        )
        for attr_name, attr_value in fixed_settings:
            setattr(c4_args, attr_name, attr_value)

        super().__init__(data_args=c4_args, split=split, processor=processor)