Initialize gradient as zero instead of empty

nujoug · nujoug · commit cab076fd089f · 2026-03-09T09:02:45.000-07:00
Signed-off-by: mloh <mloh@nvidia.com>
diff --git a/nemo_rl/distributed/model_utils.py b/nemo_rl/distributed/model_utils.py
@@ -220,7 +220,7 @@ def backward(
         seq_size = int(vocab_parallel_logits.shape[1])
         num_chunks = (seq_size + chunk_size - 1) // chunk_size
 
-        grad_input: torch.Tensor = torch.empty_like(
+        grad_input: torch.Tensor = torch.zeros_like(
             vocab_parallel_logits, dtype=torch.float32
         )
 
@@ -334,7 +334,7 @@ def backward(
         B, S, V_local = vocab_parallel_logits.shape
         num_chunks = (int(S) + chunk_size - 1) // chunk_size
 
-        grad_input: torch.Tensor = torch.empty_like(
+        grad_input: torch.Tensor = torch.zeros_like(
             vocab_parallel_logits, dtype=torch.float32
         )
 

Original file line number	Diff line number	Diff line change
`@@ -220,7 +220,7 @@ def backward(`
`220`	`220`	`seq_size = int(vocab_parallel_logits.shape[1])`
`221`	`221`	`num_chunks = (seq_size + chunk_size - 1) // chunk_size`
`222`	`222`
`223`		`- grad_input: torch.Tensor = torch.empty_like(`
	`223`	`+ grad_input: torch.Tensor = torch.zeros_like(`
`224`	`224`	`vocab_parallel_logits, dtype=torch.float32`
`225`	`225`	`)`
`226`	`226`
`@@ -334,7 +334,7 @@ def backward(`
`334`	`334`	`B, S, V_local = vocab_parallel_logits.shape`
`335`	`335`	`num_chunks = (int(S) + chunk_size - 1) // chunk_size`
`336`	`336`
`337`		`- grad_input: torch.Tensor = torch.empty_like(`
	`337`	`+ grad_input: torch.Tensor = torch.zeros_like(`
`338`	`338`	`vocab_parallel_logits, dtype=torch.float32`
`339`	`339`	`)`
`340`	`340`