-
Notifications
You must be signed in to change notification settings - Fork 457
Expand file tree
/
Copy pathevolcodealpaca.py
More file actions
44 lines (34 loc) · 1.5 KB
/
evolcodealpaca.py
File metadata and controls
44 lines (34 loc) · 1.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
from copy import deepcopy
from typing import TYPE_CHECKING
from llmcompressor.transformers.finetune.data import TextGenerationDataset
from llmcompressor.typing import Processor
if TYPE_CHECKING:
from llmcompressor.transformers.utils.arg_parser import DatasetArguments
@TextGenerationDataset.register(name="evolcodealpaca")
class EvolCodeAlpacaDataset(TextGenerationDataset):
    """
    Child text generation class for the Evol Code Alpaca dataset

    :param data_args: configuration settings for dataset loading
    :param split: split from dataset to load, for instance `test` or `train[:5%]`
    :param processor: processor or tokenizer to use on dataset
    """

    # Prompt wrapper; `{instruction}` is filled from each sample's
    # "instruction" field in `dataset_template`.
    EVOL_ALPACA_TEMPLATE = (
        "Below is an instruction that describes a "
        "programming task. Write a program that appropriately "
        "completes the request.\n\n### Instruction:\n{instruction}"
        "\n\n### Response:\n"
    )

    def __init__(self, data_args: "DatasetArguments", split: str, processor: Processor):
        # Work on a copy so the caller's arguments object is not mutated by
        # the dataset-specific overrides below.
        data_args = deepcopy(data_args)
        data_args.dataset = "theblackcat102/evol-codealpaca-v1"
        data_args.text_column = "text"

        super().__init__(data_args, split=split, processor=processor)

    def dataset_template(self, sample):
        """
        Format a single sample into prompt and full text

        :param sample: mapping with an "instruction" entry and, optionally,
            an "output" entry holding the reference response
        :return: dict with the full text under "text" (prompt followed by the
            output when the sample has one) and the prompt alone under
            `self.PROMPT_KEY`
        """
        prompt = self.EVOL_ALPACA_TEMPLATE.format(instruction=sample["instruction"])
        text = prompt
        # Check the sample for an "output" field. Testing the rendered prompt
        # string instead would be a substring search for the word "output",
        # which drops responses for most samples and raises KeyError when an
        # instruction mentions "output" but the sample carries no response.
        if "output" in sample:
            text += sample["output"]

        return {
            "text": text,
            # NOTE(review): PROMPT_KEY is not defined in this class; presumably
            # inherited from TextGenerationDataset - confirm.
            self.PROMPT_KEY: prompt,
        }