bionemo-recipes/recipes/evo2_megatron/README.md (77 changes: 70 additions & 7 deletions)

## Usage

### Example job

```
# If you are on A6000s, you may need to disable P2P to avoid crashes
export NCCL_P2P_DISABLE=1
# Run the job:
torchrun --nproc-per-node 2 --no-python \
train_evo2 \
--hf-tokenizer-model-path tokenizers/nucleotide_fast_tokenizer_256 \
--model-size striped_hyena_1b_nv_parallel --max-steps 12 --eval-interval 10 \
--eval-iters 3 --mock-data \
--micro-batch-size 16 --global-batch-size 32 --seq-length 1024 \
--tensor-model-parallel 1 \
--use-precision-aware-optimizer --dataset-seed 33 \
--seed 41 --spike-no-more-embedding-init \
--no-weight-decay-embeddings --cross-entropy-loss-fusion \
--align-param-gather --overlap-param-gather --grad-reduce-in-fp32 \
--decay-steps 100 --warmup-steps 10 \
--mixed-precision-recipe bf16_with_fp8_current_scaling_mixed \
--no-fp32-residual-connection --activation-checkpoint-recompute-num-layers 1 \
--attention-dropout 0.001 --hidden-dropout 0.001 \
--eod-pad-in-loss-mask --enable-preemption \
--log-interval 5 --debug-ddp-parity-freq 10 \
--wandb-project evo2-recipes-verification-tmp \
--wandb-run-name tmp_workstation_run_mock_data \
--result-dir tmpfp8 --no-renormalize-loss
```

### Example fine-tune from an existing checkpoint

First, convert the checkpoint from NeMo2 format (a temporary step until we upload the new files).

Good checkpoint names to try are:

- `evo2/1b-8k-bf16:1.0` (`--model-size 1b`)
- `evo2/7b-1m:1.0` (`--model-size 7b_arc_longcontext`)
- `evo2/40b-1m-fp8-bf16:1.0` (`--model-size 40b_arc_longcontext`)

The 1b and 40b checkpoints listed above were fine-tuned by the BioNeMo team to support both FP8 and BF16 precision; the 7b checkpoint worked well in both FP8 and BF16 out of the box, so it was not fine-tuned further. If you do want to use one of the FP8-sensitive checkpoints, such as `evo2/40b-1m`, be sure to add the `--vortex-style-fp8` option to the checkpoint conversion step below (a sketch of that variant follows the conversion example). Also note that although 8k versions of the 7b and 40b checkpoints exist, it is advisable to use the longer-context versions, since they were trained further and still run on shorter inputs.

Run `download_bionemo_data --list-resources` to see other checkpoint options and the full list of downloadable resources.

```
CKPT_NAME=evo2/1b-8k-bf16:1.0
CKPT_OUT_DIR=evo2_1b_8k_bf16_mbridge
evo2_convert_nemo2_to_mbridge \
--mixed-precision-recipe bf16_with_fp8_current_scaling_mixed \
--tokenizer-path tokenizers/nucleotide_fast_tokenizer_512 \
--model-size 1b \
--seq-length 8192 \
--nemo2-ckpt-dir $(download_bionemo_data $CKPT_NAME) \
--mbridge-ckpt-dir $CKPT_OUT_DIR
```
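
If you instead pick one of the FP8-sensitive checkpoints, the conversion is the same except for the extra `--vortex-style-fp8` flag. A minimal sketch, assuming the `evo2/40b-1m:1.0` resource name, its `--model-size`, and the `--seq-length` value shown (check `download_bionemo_data --list-resources` for the exact name before running):

```
# Sketch only: the checkpoint name/version, model size, and sequence length below are assumptions.
CKPT_NAME=evo2/40b-1m:1.0
CKPT_OUT_DIR=evo2_40b_1m_mbridge
evo2_convert_nemo2_to_mbridge \
--vortex-style-fp8 \
--mixed-precision-recipe bf16_with_fp8_current_scaling_mixed \
--tokenizer-path tokenizers/nucleotide_fast_tokenizer_512 \
--model-size 40b_arc_longcontext \
--seq-length 8192 \
--nemo2-ckpt-dir $(download_bionemo_data $CKPT_NAME) \
--mbridge-ckpt-dir $CKPT_OUT_DIR
```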

Now run as before, but point `--finetune-ckpt-dir $CKPT_OUT_DIR` at the checkpoint directory you converted in the previous step. If you run into problems with `bf16_with_fp8_current_scaling_mixed`, try `bf16_mixed` instead.

```
torchrun --nproc-per-node 2 --no-python \
train_evo2 \
--hf-tokenizer-model-path tokenizers/nucleotide_fast_tokenizer_512 \
--model-size 1b --max-steps 12 --eval-interval 10 \
--eval-iters 3 --mock-data \
--micro-batch-size 16 --global-batch-size 32 --seq-length 1024 \
--tensor-model-parallel 1 \
--use-precision-aware-optimizer --dataset-seed 33 \
--seed 41 \
--cross-entropy-loss-fusion \
--align-param-gather --overlap-param-gather --grad-reduce-in-fp32 \
--decay-steps 100 --warmup-steps 10 \
--mixed-precision-recipe bf16_with_fp8_current_scaling_mixed \
--no-fp32-residual-connection --activation-checkpoint-recompute-num-layers 1 \
--attention-dropout 0.001 --hidden-dropout 0.001 \
--eod-pad-in-loss-mask --enable-preemption \
--log-interval 5 --debug-ddp-parity-freq 10 \
--result-dir tmpfp8-ft-example --no-renormalize-loss \
--finetune-ckpt-dir $CKPT_OUT_DIR
```

## Where do the custom command-line programs come from?

See `pyproject.toml` for the entry points that map runnable programs like `train_evo2` and `evo2_convert_nemo2_to_mbridge` to their implementations in the `bionemo.evo2` package.
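
After the editable install, a quick way to confirm these entry points are available (assuming the standard argparse `--help` behavior for each program):

```
# Check that the console scripts are on your PATH
which train_evo2 evo2_convert_nemo2_to_mbridge

# Print the available options for each program (standard argparse --help)
train_evo2 --help
evo2_convert_nemo2_to_mbridge --help
```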

## Docker build

bionemo-recipes/recipes/evo2_megatron/pyproject.toml (1 change: 1 addition & 0 deletions)
@@ -40,6 +40,7 @@ train_evo2 = "bionemo.evo2.run.train:main"
#predict_evo2 = "bionemo.evo2.run.predict:main"
preprocess_evo2 = "bionemo.evo2.data.preprocess:main"
splice_evo2 = "bionemo.evo2.data.transcript_extraction:main"
+evo2_convert_nemo2_to_mbridge = "bionemo.evo2.utils.checkpoint.nemo2_to_mbridge:main"
#evo2_convert_to_nemo2 = "bionemo.evo2.utils.checkpoint.convert_to_nemo:main"
#evo2_nemo2_to_hf = "bionemo.evo2.utils.checkpoint.nemo2_to_hf:main"
#evo2_remove_optimizer = "bionemo.evo2.utils.checkpoint.evo2_remove_optimizer:main"
@@ -267,7 +267,6 @@ def _evo2_common(
),
tokenizer=TokenizerConfig(
tokenizer_type="HuggingFaceTokenizer",
-hf_tokenizer_kwargs={"trust_remote_code": True},
tokenizer_model=hf_tokenizer_model_or_path or "EleutherAI/gpt-neox-20b",
),
checkpoint=CheckpointConfig(
@@ -710,9 +710,9 @@ def train(args: argparse.Namespace) -> None:
recipe_kwargs["stride"] = args.stride
recipe_kwargs["window_min_length_threshold"] = args.window_min_length_threshold
recipe_kwargs["rc_aug"] = args.rc_aug
-elif args.dataset_config_path:
+elif args.dataset_config:
recipe_kwargs["dataset_dir"] = args.dataset_dir
-recipe_kwargs["dataset_config_path"] = args.dataset_config_path
+recipe_kwargs["dataset_config_path"] = args.dataset_config

recipe_kwargs["pad_eod_loss_mask"] = args.eod_pad_in_loss_mask

@@ -918,6 +918,7 @@ def train(args: argparse.Namespace) -> None:
if args.finetune_ckpt_dir:
cfg.checkpoint.finetune = True
cfg.checkpoint.pretrained_checkpoint = args.finetune_ckpt_dir
+cfg.checkpoint.dist_ckpt_strictness = "ignore_all"  # unfortunately necessary to avoid extra_state issues.
if args.nvidia_fault_tolerance:
cfg.ft = FaultToleranceConfig(
enable_ft_package=True,