Skip to content

Commit 41a46d8

Browse files
wukaixingxpmreso
authored and committed
fix alpaca dataset by using 5% of the data as eval and make sure len(eval_loader) > 0
1 parent d24ea27 commit 41a46d8

File tree

2 files changed

+8
-2
lines changed

2 files changed

+8
-2
lines changed

src/llama_recipes/datasets/alpaca_dataset.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,12 @@
2626
class InstructionDataset(Dataset):
2727
def __init__(self, dataset_config, tokenizer, partition="train"):
2828
self.ann = json.load(open(dataset_config.data_path))
29+
# Use 5% of the dataset for evaluation
30+
eval_length = int(len(self.ann)/20)
2931
if partition == "train":
30-
self.ann = self.ann[200:]
32+
self.ann = self.ann[eval_length:]
3133
else:
32-
self.ann = self.ann[:200]
34+
self.ann = self.ann[:eval_length]
3335

3436
self.tokenizer = tokenizer
3537

src/llama_recipes/finetuning.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -250,6 +250,10 @@ def main(**kwargs):
250250
pin_memory=True,
251251
**val_dl_kwargs,
252252
)
253+
if len(eval_dataloader) == 0:
254+
raise ValueError("The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set.")
255+
else:
256+
print(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}")
253257

254258
# Initialize the optimizer and learning rate scheduler
255259
if fsdp_config.pure_bf16 and fsdp_config.optimizer == "anyprecision":

0 commit comments

Comments
 (0)