# Test configuration to verify evaluation is working
# Runs a short training job with evaluation enabled

comm:
  trace_buf_size: 0

model:
  name: llama3
  flavor: 8B
  hf_assets_path: /home/hosseinkh/models/Meta-Llama-3.1-8B-Instruct

processes:
  procs: 8  # 8 processes, one per GPU
  with_gpus: true

optimizer:
  name: AdamW
  lr: 1e-5
  eps: 1e-8

lr_scheduler:
  warmup_steps: 2
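  # Assumption: the scheduler ramps the learning rate up to lr over the
  # first 2 steps before any decay kicks in.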

dataset:
  path: "yahma/alpaca-cleaned"
  split: "train[:95%]"

dataset_val:
  path: "yahma/alpaca-cleaned"
  split: "train[95%:]"
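  # The two split strings use standard Hugging Face slice syntax, giving a
  # disjoint 95%/5% train/validation split of the same "train" split (see
  # the sketch after the config).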

training:
  local_batch_size: 4
  seq_len: 512  # Shorter sequences for speed
  max_norm: 1.0
  steps: 100  # 100 training steps total
  compile: false

validation:
  enabled: true  # Enable/disable validation
  eval_interval: 100  # Run evaluation every 100 training steps
  eval_steps: 50  # Number of batches per evaluation (0 = full epoch)
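  # Note: with steps: 100 and eval_interval: 100, evaluation fires once, at
  # the end of the run; lower eval_interval to evaluate more often.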

parallelism:
  data_parallel_replicate_degree: 1
  data_parallel_shard_degree: -1
  tensor_parallel_degree: 2
  pipeline_parallel_degree: 1
  context_parallel_degree: 1
  expert_parallel_degree: 1
  disable_loss_parallel: false
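  # Assuming -1 means "infer from the remaining ranks" (the common
  # convention), data_parallel_shard_degree resolves to 8 / 2 = 4 given
  # procs: 8 and tensor_parallel_degree: 2.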

checkpoint:
  enable: true
  folder: /home/hosseinkh/models/Meta-Llama-3.1-8B-Instruct/test_eval_checkpoints
  initial_load_path: /home/hosseinkh/models/Meta-Llama-3.1-8B-Instruct/
  initial_load_in_hf: true
  last_save_in_hf: true
  interval: 100  # Don't save frequently during test
  async_mode: disabled
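  # With interval: 100 and steps: 100, a single checkpoint is written at the
  # end of the run.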

activation_checkpoint:
  mode: selective
  selective_ac_option: op
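
A minimal sketch to sanity-check the two dataset split strings above, assuming the Hugging Face `datasets` library (the trainer itself may load data differently):

    from datasets import load_dataset

    # "train[:95%]" / "train[95%:]" are standard HF slice syntax:
    # disjoint 95%/5% slices of the same "train" split.
    train = load_dataset("yahma/alpaca-cleaned", split="train[:95%]")
    val = load_dataset("yahma/alpaca-cleaned", split="train[95%:]")

    print(f"train examples: {len(train)}")
    print(f"val examples:   {len(val)}")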