Commit b6b0b6f

Merge remote-tracking branch 'upstream/main'
2 parents: a407c81 + 5d41c7a

3 files changed: +23 −26 lines


tests/kfto/core/hf_llm_training.py (9 additions, 12 deletions)
@@ -21,7 +21,7 @@
 import json
 import os
 
-from datasets import load_from_disk, Dataset
+from datasets import load_dataset, Dataset
 from datasets.distributed import split_dataset_by_node
 from peft import LoraConfig, get_peft_model
 import transformers
@@ -71,28 +71,25 @@ def setup_model_and_tokenizer(model_uri, transformer_type, model_dir):
     return model, tokenizer
 
 # This function is a modified version of the original.
-def load_and_preprocess_data(dataset_dir, transformer_type, tokenizer):
+def load_and_preprocess_data(dataset_file, transformer_type, tokenizer):
     # Load and preprocess the dataset
     logger.info("Load and preprocess dataset")
 
-    file_path = os.path.realpath(dataset_dir)
+    file_path = os.path.realpath(dataset_file)
 
-    if transformer_type != AutoModelForImageClassification:
-        dataset = load_from_disk(file_path)
+    dataset = load_dataset('json', data_files=file_path)
 
+    if transformer_type != AutoModelForImageClassification:
         logger.info(f"Dataset specification: {dataset}")
         logger.info("-" * 40)
 
         logger.info("Tokenize dataset")
         # TODO (andreyvelich): Discuss how user should set the tokenizer function.
-        num_cores = os.cpu_count()
         dataset = dataset.map(
-            lambda x: tokenizer(x["text"], padding=True, truncation=True, max_length=128),
+            lambda x: tokenizer(x["output"], padding=True, truncation=True, max_length=128),
             batched=True,
-            num_proc=num_cores
+            keep_in_memory=True
         )
-    else:
-        dataset = load_from_disk(file_path)
 
     # Check if dataset contains `train` key. Otherwise, load full dataset to train_data.
     if "train" in dataset:
@@ -175,7 +172,7 @@ def parse_arguments():
     parser.add_argument("--model_uri", help="model uri")
     parser.add_argument("--transformer_type", help="model transformer type")
     parser.add_argument("--model_dir", help="directory containing model")
-    parser.add_argument("--dataset_dir", help="directory containing dataset")
+    parser.add_argument("--dataset_file", help="dataset file path")
     parser.add_argument("--lora_config", help="lora_config")
     parser.add_argument(
         "--training_parameters", help="hugging face training parameters"
@@ -197,7 +194,7 @@ def parse_arguments():
 
     logger.info("Preprocess dataset")
     train_data, eval_data = load_and_preprocess_data(
-        args.dataset_dir, transformer_type, tokenizer
+        args.dataset_file, transformer_type, tokenizer
    )
 
     logger.info("Setup LoRA config for model")

tests/kfto/core/kfto_training_test.go (4 additions, 14 deletions)
@@ -106,26 +106,16 @@ func createKFTOPyTorchJob(test Test, namespace string, config corev1.ConfigMap,
 				},
 				{
 					Name:            "copy-dataset",
-					Image:           "registry.access.redhat.com/ubi9/python-311:9.5-1730564330",
+					Image:           GetAlpacaDatasetImage(),
 					ImagePullPolicy: corev1.PullIfNotPresent,
 					VolumeMounts: []corev1.VolumeMount{
 						{
 							Name:      "tmp-volume",
 							MountPath: "/tmp",
 						},
 					},
-					Command: []string{
-						"/bin/sh",
-						"-c",
-						`pip install --target /tmp/.local datasets && \
-							HF_HOME=/tmp/.cache PYTHONPATH=/tmp/.local python -c "from datasets import load_dataset; dataset = load_dataset('tatsu-lab/alpaca', split='train[:100]'); dataset.save_to_disk('/tmp/dataset')"`,
-					},
-					Env: []corev1.EnvVar{
-						{
-							Name:  "HF_HOME",
-							Value: "/tmp/.cache",
-						},
-					},
+					Command: []string{"/bin/sh", "-c"},
+					Args:    []string{"mkdir /tmp/all_datasets; cp -r /dataset/* /tmp/all_datasets; ls /tmp/all_datasets"},
 				},
 			},
 			Containers: []corev1.Container{
@@ -138,7 +128,7 @@ func createKFTOPyTorchJob(test Test, namespace string, config corev1.ConfigMap,
 					`python /etc/config/hf_llm_training.py \
 						--model_uri /tmp/model/bloom-560m \
 						--model_dir /tmp/model/bloom-560m \
-						--dataset_dir /tmp/dataset \
+						--dataset_file /tmp/all_datasets/alpaca_data_hundredth.json \
 						--transformer_type AutoModelForCausalLM \
 						--training_parameters '{"output_dir": "/mnt/output", "per_device_train_batch_size": 8, "num_train_epochs": 3, "logging_dir": "/logs", "eval_strategy": "epoch"}' \
 						--lora_config '{"r": 4, "lora_alpha": 16, "lora_dropout": 0.1, "bias": "none"}'`,
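The --training_parameters and --lora_config flags above carry JSON strings. A hedged sketch of how they plausibly decode inside the script, assuming the common pattern of splatting the parsed JSON into the Hugging Face and PEFT constructors (the actual wiring in hf_llm_training.py is not shown in this diff, and the "eval_strategy" key needs a transformers version that accepts it):

import json

import transformers
from peft import LoraConfig

training_parameters = '{"output_dir": "/mnt/output", "per_device_train_batch_size": 8, "num_train_epochs": 3, "logging_dir": "/logs", "eval_strategy": "epoch"}'
lora_config = '{"r": 4, "lora_alpha": 16, "lora_dropout": 0.1, "bias": "none"}'

# Splat the parsed dicts into the config objects.
train_args = transformers.TrainingArguments(**json.loads(training_parameters))
peft_config = LoraConfig(**json.loads(lora_config))

print(train_args.output_dir, peft_config.r)  # /mnt/output 4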

tests/kfto/upgrade/kfto_kueue_sft_upgrade_training_test.go (10 additions, 0 deletions)
@@ -186,6 +186,10 @@ func createPyTorchJob(test Test, namespace, localQueueName string, config corev1
 							Name:      "tmp-volume",
 							MountPath: "/tmp",
 						},
+						{
+							Name:      "output-volume",
+							MountPath: "/mnt/output",
+						},
 					},
 					Resources: corev1.ResourceRequirements{
 						Requests: corev1.ResourceList{
@@ -226,6 +230,12 @@ func createPyTorchJob(test Test, namespace, localQueueName string, config corev1
 						EmptyDir: &corev1.EmptyDirVolumeSource{},
 					},
 				},
+				{
+					Name: "output-volume",
+					VolumeSource: corev1.VolumeSource{
+						EmptyDir: &corev1.EmptyDirVolumeSource{},
+					},
+				},
 			},
 		},
 	},
