 class TTMaskTokenInference(LightweightModule):
-    # def __init__(
-    #     self, device, parameters, dim, num_heads=1, qkv_bias=False, qk_scale=None, attn_drop=0.0, proj_drop=0.0
-    # ):
-    #     self.device = device
-    #     self.dim = dim
-    #     self.num_heads = num_heads
-    #     self.head_dim = dim // num_heads
-    #     self.scale = qk_scale or (self.head_dim**-0.5)
-
-    #     # Layer norm parameters (would need to be loaded from state dict)
-    #     self.norm_weight = parameters["norm"]["weight"]  # ttnn tensor for layer norm weight
-    #     self.norm_bias = parameters["norm"]["bias"]  # ttnn tensor for layer norm bias
-
-    #     # Linear layer weights (would need to be preprocessed and loaded)
-    #     self.q_weight = parameters["q"]["weight"]  # ttnn tensor for query projection
-    #     self.k_weight = parameters["k"]["weight"]  # ttnn tensor for key projection
-    #     self.v_weight = parameters["v"]["weight"]  # ttnn tensor for value projection
-    #     self.proj_weight = parameters["proj"]["weight"]  # ttnn tensor for output projection
-
-    #     self.q_bias = parameters["q"]["bias"] if qkv_bias else None
-    #     self.k_bias = parameters["k"]["bias"] if qkv_bias else None
-    #     self.v_bias = parameters["v"]["bias"] if qkv_bias else None
-    #     self.proj_bias = parameters["proj"]["bias"]
-
-    #     # Scale tensor
-    #     scale_tensor = torch.tensor(self.scale).view(1, 1, 1, 1)
-    #     self.tt_scale = ttnn.from_torch(scale_tensor, dtype=ttnn.bfloat16, device=device, layout=ttnn.TILE_LAYOUT)
-
     def __init__(
         self, device, parameters, dim, num_heads=1, qkv_bias=False, qk_scale=None, attn_drop=0.0, proj_drop=0.0
     ):
@@ -61,71 +33,6 @@ def __init__(
         scale_tensor = torch.tensor(self.scale).view(1, 1, 1, 1)
         self.tt_scale = ttnn.from_torch(scale_tensor, dtype=ttnn.bfloat16, device=device, layout=ttnn.TILE_LAYOUT)

-    # def __call__(self, fea):
-    #     B, N, C = fea.shape
-
-    #     # Layer normalization
-    #     x = ttnn.layer_norm(fea, weight=self.norm_weight, bias=self.norm_bias, memory_config=ttnn.L1_MEMORY_CONFIG)
-    #     fea_skip = fea
-    #     fea_skip = ttnn.reallocate(fea_skip, memory_config=ttnn.DRAM_MEMORY_CONFIG)
-    #     ttnn.deallocate(fea)
-
-    #     # Split into classification token and feature tokens
-    #     # T_s: classification token [B, 1, C]
-    #     # F_s: feature tokens [B, N-1, C]
-    #     T_s = ttnn.slice(x, [0, 0, 0], [B, 1, C])
-    #     F_s = ttnn.slice(x, [0, 1, 0], [B, N, C])
-    #     ttnn.deallocate(x)
-
-    #     # Query from feature tokens
-    #     q = ttnn.linear(F_s, self.q_weight, bias=self.q_bias, memory_config=ttnn.L1_MEMORY_CONFIG)
-    #     # q = ttnn.reshape(q, (B, N - 1, self.num_heads, self.head_dim))
-    #     # q = ttnn.permute(q, (0, 2, 1, 3))
-
-    #     # Key from classification token
-    #     k = ttnn.linear(T_s, self.k_weight, bias=self.k_bias, memory_config=ttnn.L1_MEMORY_CONFIG)
-    #     # k = ttnn.reshape(k, (B, 1, self.num_heads, self.head_dim))
-    #     # k = ttnn.permute(k, (0, 2, 1, 3))
-
-    #     # Value from classification token
-    #     v = ttnn.linear(T_s, self.v_weight, bias=self.v_bias, memory_config=ttnn.L1_MEMORY_CONFIG)
-    #     # v = ttnn.reshape(v, (B, 1, self.num_heads, self.head_dim))
-    #     # v = ttnn.permute(v, (0, 2, 1, 3))
-
-    #     # Attention computation: q @ k.T
-    #     k_transposed = ttnn.transpose(k, -2, -1)
-    #     attn = ttnn.matmul(q, k_transposed)
-
-    #     # Scale attention scores
-    #     attn = ttnn.multiply(attn, self.tt_scale)
-
-    #     # Apply sigmoid instead of softmax
-    #     attn = ttnn.sigmoid(attn)
-
-    #     # Apply attention dropout (if needed, would require custom implementation)
-    #     # attn = apply_dropout(attn, attn_drop)
-
-    #     # Compute attention output
-    #     infer_fea = ttnn.matmul(attn, v)
-
-    #     # Reshape back to [B, N-1, C]
-    #     infer_fea = ttnn.permute(infer_fea, (0, 2, 1, 3))
-    #     infer_fea = ttnn.to_layout(infer_fea, layout=ttnn.ROW_MAJOR_LAYOUT)
-    #     infer_fea = ttnn.reshape(infer_fea, (B, N - 1, C))
-    #     infer_fea = ttnn.to_layout(infer_fea, layout=ttnn.TILE_LAYOUT)
-
-    #     # Output projection
-    #     infer_fea = ttnn.linear(infer_fea, self.proj_weight, bias=self.proj_bias)
-
-    #     # Apply projection dropout (if needed)
-    #     # infer_fea = apply_dropout(infer_fea, proj_drop)
-
-    #     # Residual connection with original feature tokens
-    #     original_features = ttnn.slice(fea_skip, [0, 1, 0], [B, N, C])
-    #     infer_fea = ttnn.add(infer_fea, original_features)
-
-    #     return infer_fea
-
     def __call__(self, fea):
         B, N, C = fea.shape

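For reference, the algorithm sketched in the deleted comments can also be written as a few lines of plain PyTorch. The sketch below is hypothetical and not part of this commit: it assumes a single head, omits dropout, and uses standard nn.Linear / nn.LayerNorm modules in place of the preprocessed ttnn parameters. Feature tokens supply the queries, the classification token supplies the key and value, the scores pass through a sigmoid instead of a softmax, and the projected result is added back to the original feature tokens.

import torch
import torch.nn as nn


class MaskTokenInferenceRef(nn.Module):
    """Hypothetical PyTorch reference for the sigmoid cross-attention above."""

    def __init__(self, dim, qkv_bias=False, qk_scale=None):
        super().__init__()
        self.scale = qk_scale or dim**-0.5  # single head, so head_dim == dim
        self.norm = nn.LayerNorm(dim)
        self.q = nn.Linear(dim, dim, bias=qkv_bias)
        self.k = nn.Linear(dim, dim, bias=qkv_bias)
        self.v = nn.Linear(dim, dim, bias=qkv_bias)
        self.proj = nn.Linear(dim, dim)

    def forward(self, fea):  # fea: [B, N, C]
        x = self.norm(fea)
        T_s, F_s = x[:, :1, :], x[:, 1:, :]  # class token / feature tokens
        q = self.q(F_s)  # queries from feature tokens      [B, N-1, C]
        k = self.k(T_s)  # key from classification token    [B, 1, C]
        v = self.v(T_s)  # value from classification token  [B, 1, C]
        attn = torch.sigmoid(q @ k.transpose(-2, -1) * self.scale)  # [B, N-1, 1]
        infer_fea = self.proj(attn @ v)  # [B, N-1, C]
        return infer_fea + fea[:, 1:, :]  # residual on the original feature tokens

As a quick check of the shapes, MaskTokenInferenceRef(dim=384)(torch.randn(2, 197, 384)) returns a [2, 196, 384] tensor, i.e. one refined embedding per feature token with the classification token consumed as the key/value source.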