@@ -389,7 +389,29 @@ def __init__(
             nn.LayerNorm(dim) if dual_patchnorm else None,
         )
 
-        self.axial_pos_emb = nn.Parameter(torch.randn(2, patch_height_width, dim) * 0.02)
+        # axial positional embeddings, parameterized by an MLP
+
+        pos_emb_dim = dim // 2
+
+        self.axial_pos_emb_height_mlp = nn.Sequential(
+            Rearrange('... -> ... 1'),
+            nn.Linear(1, pos_emb_dim),
+            nn.SiLU(),
+            nn.Linear(pos_emb_dim, pos_emb_dim),
+            nn.SiLU(),
+            nn.Linear(pos_emb_dim, dim)
+        )
+
+        self.axial_pos_emb_width_mlp = nn.Sequential(
+            Rearrange('... -> ... 1'),
+            nn.Linear(1, pos_emb_dim),
+            nn.SiLU(),
+            nn.Linear(pos_emb_dim, pos_emb_dim),
+            nn.SiLU(),
+            nn.Linear(pos_emb_dim, dim)
+        )
+
+        # nn.Parameter(torch.randn(2, patch_height_width, dim) * 0.02)
 
         self.to_pixels = nn.Sequential(
             LayerNorm(dim),
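A minimal standalone sketch (not part of the diff) of one of the axial positional embedding MLPs added above, assuming dim = 512; the names pos_mlp and coords are hypothetical and used only for illustration. It shows that a 1D tensor of normalized patch coordinates maps to one dim-sized embedding per position along that axis.

import torch
from torch import nn
from einops.layers.torch import Rearrange

dim = 512
pos_emb_dim = dim // 2

pos_mlp = nn.Sequential(
    Rearrange('... -> ... 1'),            # (n,) coordinates -> (n, 1)
    nn.Linear(1, pos_emb_dim),
    nn.SiLU(),
    nn.Linear(pos_emb_dim, pos_emb_dim),
    nn.SiLU(),
    nn.Linear(pos_emb_dim, dim)
)

coords = torch.linspace(0., 1., steps = 16)  # one normalized coordinate per patch along one axis
print(pos_mlp(coords).shape)                 # torch.Size([16, 512])

Because the embedding is a continuous function of the coordinate rather than a fixed-size learned table, the same module can be queried at a different number of positions (a different patch grid) without resizing any parameters.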
@@ -414,6 +436,10 @@ def __init__(
 
         self.blocks = nn.ModuleList([RINBlock(dim, dim_latent = dim_latent, latent_self_attn_depth = latent_self_attn_depth, **attn_kwargs) for _ in range(depth)])
 
+    @property
+    def device(self):
+        return next(self.parameters()).device
+
     def forward(
         self,
         x,
@@ -451,7 +477,9 @@ def forward(
 
         patches = self.to_patches(x)
 
-        pos_emb_h, pos_emb_w = self.axial_pos_emb
+        height_range = width_range = torch.linspace(0., 1., steps = int(math.sqrt(patches.shape[-2])), device = self.device)
+        pos_emb_h, pos_emb_w = self.axial_pos_emb_height_mlp(height_range), self.axial_pos_emb_width_mlp(width_range)
+
         pos_emb = rearrange(pos_emb_h, 'i d -> i 1 d') + rearrange(pos_emb_w, 'j d -> 1 j d')
         patches = patches + rearrange(pos_emb, 'i j d -> (i j) d')
 
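A sketch (not part of the diff) of how the forward pass above combines the two axial embeddings, assuming a 16 x 16 patch grid and dim = 512; pos_emb_h and pos_emb_w are random stand-ins for the outputs of the two MLPs.

import math
import torch
from einops import rearrange

dim = 512
patches = torch.randn(2, 16 * 16, dim)       # (batch, number of patches, dim)

side = int(math.sqrt(patches.shape[-2]))     # 16; assumes the patch grid is square
pos_emb_h = torch.randn(side, dim)           # stand-in for axial_pos_emb_height_mlp(height_range)
pos_emb_w = torch.randn(side, dim)           # stand-in for axial_pos_emb_width_mlp(width_range)

# outer sum over the two axes gives one embedding per (row, column) position
pos_emb = rearrange(pos_emb_h, 'i d -> i 1 d') + rearrange(pos_emb_w, 'j d -> 1 j d')
patches = patches + rearrange(pos_emb, 'i j d -> (i j) d')

print(patches.shape)                         # torch.Size([2, 256, 512])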