Commit 52310f6

Clean up fp8 / fp4 recipe handling (#1504)
Some misc. cleanup of #1484

## Summary by CodeRabbit

### Release Notes

* **New Features**
  * Added FP8/FP4 low-precision quantization support for ESM2, Llama3, Mixtral, and Qwen models with per-layer precision control
  * Added quantized model initialization option for faster FP8 deployments
  * Introduced per-layer autocast context management for flexible precision configuration
* **Documentation**
  * Added comprehensive "Running with Low Precision (FP8/FP4)" guides for ESM2 and Llama3 models
* **Tests**
  * Enhanced quantization tests with per-layer precision validation and legacy FP8 pathway coverage
  * Removed redundant quantization test module
* **Chores**
  * Refactored model initialization flows for quantization recipe handling
  * Updated configuration paths across training scripts

Signed-off-by: Peter St. John <pstjohn@nvidia.com>
1 parent afcb80e commit 52310f6

50 files changed: +2423 additions, -2128 deletions

.secrets.baseline

Lines changed: 10 additions & 1 deletion
```diff
@@ -142,6 +142,15 @@
       }
     ],
   "results": {
+    "bionemo-recipes/recipes/esm2_native_te/tests/test_train.py": [
+      {
+        "type": "Base64 High Entropy String",
+        "filename": "bionemo-recipes/recipes/esm2_native_te/tests/test_train.py",
+        "hashed_secret": "76fc9c9f16bca9a436a5fcede09cc3593a4bf6f0",
+        "is_verified": false,
+        "line_number": 511
+      }
+    ],
     "pyproject.toml": [
       {
         "type": "Hex High Entropy String",
@@ -152,5 +161,5 @@
       }
     ]
   },
-  "generated_at": "2025-12-29T20:49:21Z"
+  "generated_at": "2026-03-09T20:44:27Z"
 }
```

bionemo-recipes/models/esm2/README.md

Lines changed: 96 additions & 0 deletions
Training recipes are available in the `bionemo-recipes/recipes/` directory:

- **[esm2_accelerate_te](../../recipes/esm2_accelerate_te/)** - Trains the model using HuggingFace
  [Accelerate](https://huggingface.co/docs/accelerate/index).

## Running with Low Precision (FP8/FP4)

The TE-optimized ESM-2 model supports per-layer quantization via two mechanisms: a **config-level**
`layer_precision` list that declares which layers use which precision, and **constructor-level** recipe
objects (`fp8_recipe`, `fp4_recipe`) that control the quantization behavior.

### Configuration: `layer_precision`

`NVEsmConfig.layer_precision` is a list of length `num_hidden_layers` where each element is `"fp8"`,
`"fp4"`, or `None` (BF16 fallback). When set, it controls the `te.autocast` context used for each
transformer layer during both initialization and the forward pass.
```python
from modeling_esm_te import NVEsmConfig, NVEsmForMaskedLM

# All layers in FP8
config = NVEsmConfig.from_pretrained(
    "nvidia/esm2_t6_8M_UR50D",
    layer_precision=["fp8"] * 6,
)
```
If you pass an `fp8_recipe` to the model constructor **without** setting `layer_precision`, it
defaults to `["fp8"] * num_hidden_layers` (all layers in FP8). You can also mix precisions, for
example running most layers in FP8 but keeping the first and last layers in BF16:

```python
# First and last layers stay in BF16; the four middle layers run in FP8
layer_precision = [None] + ["fp8"] * 4 + [None]
config = NVEsmConfig.from_pretrained(
    "nvidia/esm2_t6_8M_UR50D",
    layer_precision=layer_precision,
)
```
### Constructor arguments: `fp8_recipe` and `fp4_recipe`

The model classes (`NVEsmModel`, `NVEsmForMaskedLM`, `NVEsmForTokenClassification`) accept
`fp8_recipe` and `fp4_recipe` keyword arguments. These are `transformer_engine.common.recipe.Recipe`
objects that configure the quantization algorithm (e.g., delayed scaling, block scaling, MXFP8).
```python
import transformer_engine.common.recipe as te_recipe

from modeling_esm_te import NVEsmConfig, NVEsmForMaskedLM

fp8_recipe = te_recipe.DelayedScaling()

config = NVEsmConfig.from_pretrained(
    "nvidia/esm2_t6_8M_UR50D",
    layer_precision=["fp8"] * 6,
)
model = NVEsmForMaskedLM(config, fp8_recipe=fp8_recipe)
```
For FP4 (NVFP4) quantization, pass an `fp4_recipe` instead and set the corresponding layers to
`"fp4"` in `layer_precision`:

```python
fp4_recipe = te_recipe.NVFP4BlockScaling()

config = NVEsmConfig.from_pretrained(
    "nvidia/esm2_t6_8M_UR50D",
    layer_precision=["fp4"] * 6,
)
model = NVEsmForMaskedLM(config, fp4_recipe=fp4_recipe)
```
You can also mix FP8 and FP4 layers by providing both recipes and a mixed `layer_precision` list.
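As a sketch of such a mixed setup, a small helper can build the `layer_precision` list. The helper below is hypothetical, not part of the model API:

```python
def make_layer_precision(num_layers, fp4_layers=(), bf16_layers=()):
    """Build a layer_precision list: FP8 by default, with selected
    layer indices overridden to FP4 or left in BF16 (None).

    Hypothetical convenience helper; not part of the model API.
    """
    precision = ["fp8"] * num_layers
    for i in fp4_layers:
        precision[i] = "fp4"
    for i in bf16_layers:
        precision[i] = None
    return precision


# Middle two layers in FP4, first and last in BF16, the rest in FP8
layer_precision = make_layer_precision(6, fp4_layers=(2, 3), bf16_layers=(0, 5))
# → [None, "fp8", "fp4", "fp4", "fp8", None]
```

Because this list contains both `"fp8"` and `"fp4"` entries, the model constructor would need both `fp8_recipe` and `fp4_recipe`.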
### Quantized model initialization: `use_quantized_model_init`

When `use_quantized_model_init=True` is set in the config, layers are created inside a
`te.quantized_model_init` context. This tells TransformerEngine to initialize weights directly in
the target quantized format, avoiding a separate quantization step after initialization. This is
primarily useful when loading pre-quantized checkpoints.
```python
config = NVEsmConfig.from_pretrained(
    "nvidia/esm2_t6_8M_UR50D",
    layer_precision=["fp4"] * 6,
    use_quantized_model_init=True,
)
model = NVEsmForMaskedLM(config, fp4_recipe=te_recipe.NVFP4BlockScaling())
```
### Notes

- The `lm_head` (and the `dense` projection in `NVEsmLMHead`) always runs in higher precision
  (`te.autocast(enabled=False)`) regardless of `layer_precision`, to avoid numerical instability in
  the output logits.
- FP8 requires compute capability 9.0+ (Hopper). MXFP8 requires compute capability 10.0+
  (Blackwell).
- If an `fp8_recipe` is provided without `layer_precision`, all layers default to FP8. Providing
  both `fp8_recipe` and `fp4_recipe` without `layer_precision` raises a `RuntimeError`.
- An FP4 layer **requires** an `fp4_recipe`; omitting it raises a `RuntimeError`.
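The defaulting and error rules in the last two bullets can be restated as plain Python. This is a hypothetical re-statement of the documented behavior, not the model's actual implementation:

```python
def resolve_layer_precision(num_layers, layer_precision=None,
                            fp8_recipe=None, fp4_recipe=None):
    """Apply the documented defaulting/validation rules for layer precision.

    Hypothetical sketch of the documented rules; not the model's real code.
    """
    if layer_precision is None:
        if fp8_recipe is not None and fp4_recipe is not None:
            # Ambiguous: no way to know which layers get which recipe
            raise RuntimeError(
                "Both fp8_recipe and fp4_recipe provided without layer_precision"
            )
        if fp8_recipe is not None:
            layer_precision = ["fp8"] * num_layers  # default: all layers FP8
        else:
            # The fp4-only default is not documented; BF16 assumed here
            layer_precision = [None] * num_layers
    if "fp4" in layer_precision and fp4_recipe is None:
        raise RuntimeError("An FP4 layer requires an fp4_recipe")
    return layer_precision
```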
## Converting Between Model Formats

This section explains how to convert between Hugging Face Transformers and Transformer Engine (TE) ESM2 model formats.
