
Commit 9c5ccc2

navsud authored and facebook-github-bot committed
Remove explicit device arguments
Summary:
As part of enabling QAT for the HTP model, we need to run QAT on the same model that we use during export. With the device type hardcoded to "cpu", moving the model to "cuda" requires many model changes. The simpler solution is to remove the explicit device arguments and let the device be auto-inferred during export. For training, we already build the model inside a `with torch.device("cuda"):` context, which takes care of placement.

Update: This was failing multiple export tests, because Llama2Model (in llama/model.py) was instantiating the transformer on the "meta" device, which required the rope params to be explicitly instantiated on the "cpu" device. Changed "meta" to "cpu" to fix this.

Differential Revision: D82239525
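For context, here is a minimal sketch (not part of the commit) of the mechanism the summary relies on: PyTorch factory functions that omit `device=` allocate on the ambient default device, and `torch.device(...)` works as a context manager that sets it. The `make_causal_mask` helper is hypothetical, standing in for the buffer construction in attention.py.

```python
import torch

def make_causal_mask(max_context_len: int) -> torch.Tensor:
    # Hypothetical stand-in for the attention.py buffer setup. No device=
    # argument: the mask is created on whatever the ambient device is.
    return torch.tril(
        torch.ones(max_context_len, max_context_len, dtype=torch.bool)
    )

mask = make_causal_mask(8)
print(mask.device)  # cpu -- the default device

if torch.cuda.is_available():
    # The torch.device context manager redirects factory calls, so the same
    # unmodified code builds a CUDA model for QAT.
    with torch.device("cuda"):
        cuda_mask = make_causal_mask(8)
    print(cuda_mask.device)  # cuda:0
```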
Parent: 2283294

3 files changed (+3, -5 lines)


examples/models/llama/attention.py

Lines changed: 0 additions & 1 deletion

@@ -429,7 +429,6 @@ def __init__(
                 self.max_context_len,
                 self.max_context_len,
                 dtype=torch.bool,
-                device="cpu",
             )
         )
         self.register_buffer("mask", causal_mask, persistent=False)

examples/models/llama/model.py

Lines changed: 1 addition & 1 deletion

@@ -195,7 +195,7 @@ def __init__(self, llm_config: Optional[LlmConfig] = None):
 
         # Within the device="meta" context, tensors that are created do not carry data.
         # They possess all other metadata a tensor carries such as size, stride, requires_grad.
-        with torch.device("meta"):
+        with torch.device("cpu"):
             # Model itself is loaded in default dtype, fp32.
             self.model_ = construct_transformer(model_args)
             # Get checkpoint dtype.
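Why the original "meta" context broke the export tests can be seen in a short sketch (plain PyTorch; the numbers are arbitrary): tensors created under the meta device carry shape and dtype but no storage, so rope frequencies computed at construction time hold no usable values once the explicit `device="cpu"` argument is gone.

```python
import torch

# Meta tensors have metadata (size, stride, dtype) but no storage, so any
# values computed at construction time are unusable until materialized.
with torch.device("meta"):
    freqs = 1.0 / (10000 ** (torch.arange(0, 8, 2).float() / 8))
print(freqs.device)       # meta
# freqs[0].item()         # would raise: meta tensors carry no data

# Building under "cpu" instead yields real, export-ready values.
with torch.device("cpu"):
    freqs = 1.0 / (10000 ** (torch.arange(0, 8, 2).float() / 8))
print(freqs[:2])          # tensor([1.0000, 0.1000])
```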

examples/models/llama/rope.py

Lines changed: 2 additions & 3 deletions

@@ -47,12 +47,11 @@ def precompute_freqs_cis(
     use_scaled: bool = False,
     scale_factor: Optional[int] = None,
     high_freq_factor: int = 4,
-    device: Union[str, torch.device] = "cpu",
 ):
     freqs = 1.0 / (
-        theta ** (torch.arange(0, dim, 2, device=device)[: (dim // 2)].float() / dim)
+        theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)
     )
-    t = torch.arange(end, device=freqs.device)  # pyre-ignore
+    t = torch.arange(end)
     if use_scaled:
         assert scale_factor is not None
         freqs = apply_scaling(freqs, scale_factor, high_freq_factor)  # pyre-ignore
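After this hunk, the function no longer takes a device and simply inherits the caller's context. A trimmed-down sketch of the idea (the scaling branch is omitted, and the `torch.outer` tail is a hypothetical simplification, not the file's actual return value):

```python
import torch

def precompute_freqs_cis_sketch(dim: int, end: int, theta: float = 10000.0):
    # Mirrors the updated hunk: no device= anywhere, so both tables follow
    # the ambient default device set by the caller.
    freqs = 1.0 / (
        theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)
    )
    t = torch.arange(end)
    # Hypothetical tail: an [end, dim // 2] angle table via torch.outer.
    return torch.outer(t, freqs)

angles = precompute_freqs_cis_sketch(dim=64, end=16)
print(angles.shape, angles.device)  # torch.Size([16, 32]) cpu
```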
