@@ -158,12 +158,12 @@ To update your existing training loop, make the following changes:
  ...
 
 + # Move the model parameters to your XLA device
-+ model.to(torch_xla.device())
++ model.to('xla')
 
  for inputs, labels in train_loader:
 +   with torch_xla.step():
 +     # Transfer data to the XLA device. This happens asynchronously.
-+     inputs, labels = inputs.to(torch_xla.device()), labels.to(torch_xla.device())
++     inputs, labels = inputs.to('xla'), labels.to('xla')
       optimizer.zero_grad()
       outputs = model(inputs)
       loss = loss_fn(outputs, labels)
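Putting the new pattern together, the updated single-device loop looks roughly like the sketch below. This is a minimal reconstruction, not the guide's verbatim example: the toy model, loss function, optimizer, and data are placeholders added here so the snippet runs on its own, and the trailing `loss.backward()` / `optimizer.step()` calls follow the usual training-step pattern rather than lines shown in this hunk.

```python
import torch
import torch.nn as nn
import torch_xla

# Toy model and data, included only so the sketch is self-contained (not from the guide).
model = nn.Linear(16, 2)

# Move the model parameters to the XLA device.
model.to('xla')

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
train_loader = [(torch.randn(8, 16), torch.randint(0, 2, (8,))) for _ in range(4)]

for inputs, labels in train_loader:
    with torch_xla.step():
        # Transfer data to the XLA device. This happens asynchronously.
        inputs, labels = inputs.to('xla'), labels.to('xla')
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
```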
@@ -196,15 +196,15 @@ If you're using `DistributedDataParallel`, make the following changes:
 + # Rank and world size are inferred from the XLA device runtime
 + dist.init_process_group("xla", init_method='xla://')
 +
-+ model.to(torch_xla.device())
++ model.to('xla')
 + ddp_model = DDP(model, gradient_as_bucket_view=True)
 
 - model = model.to(rank)
 - ddp_model = DDP(model, device_ids=[rank])
 
  for inputs, labels in train_loader:
 +   with torch_xla.step():
-+     inputs, labels = inputs.to(torch_xla.device()), labels.to(torch_xla.device())
++     inputs, labels = inputs.to('xla'), labels.to('xla')
       optimizer.zero_grad()
       outputs = ddp_model(inputs)
       loss = loss_fn(outputs, labels)
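For the `DistributedDataParallel` case, the per-process function after this change would look roughly like the following sketch. The toy model, optimizer, and data are again placeholders, and launching via `torch_xla.launch()` is an assumption made here for completeness; the surrounding guide governs the actual launcher and model setup.

```python
import torch
import torch.nn as nn
import torch.distributed as dist
import torch_xla
from torch.nn.parallel import DistributedDataParallel as DDP

def _mp_fn(index):
    # Rank and world size are inferred from the XLA device runtime.
    dist.init_process_group("xla", init_method='xla://')

    # Toy model and data, included only so the sketch is self-contained (not from the guide).
    model = nn.Linear(16, 2).to('xla')
    ddp_model = DDP(model, gradient_as_bucket_view=True)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(ddp_model.parameters(), lr=1e-3)
    train_loader = [(torch.randn(8, 16), torch.randint(0, 2, (8,))) for _ in range(4)]

    for inputs, labels in train_loader:
        with torch_xla.step():
            # Transfer data to the XLA device. This happens asynchronously.
            inputs, labels = inputs.to('xla'), labels.to('xla')
            optimizer.zero_grad()
            outputs = ddp_model(inputs)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()

if __name__ == '__main__':
    # Assumes torch_xla.launch() is available to spawn one process per local XLA device.
    torch_xla.launch(_mp_fn)
```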