52 changes: 52 additions & 0 deletions KernelBench/changelog/constant_fill_fixes.txt
@@ -0,0 +1,52 @@
Changelog: Constant Fill Problems Fixes
========================================

Date: 2025-12-20

Fixed 3 problems that produced constant (zero) outputs regardless of input.

--------------------------------------------------------------------------------

1. level2/80_Gemm_Max_Subtract_GELU.py

Issue: After max(dim=1, keepdim=True), the shape is (B, 1). The mean along dim=1
of a single-element tensor equals the value itself, so x - mean = 0 and the final
GELU(0) = 0: the output is constant zero regardless of input.

Fix: Changed mean dimension from 1 to 0.
- x = x - x.mean(dim=1, keepdim=True)
+ x = x - x.mean(dim=0, keepdim=True)

Why: Shape is (B,1), so mean(dim=0) gives scalar mean across B samples; each
sample's max differs, producing non-zero deviations from batch mean.
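
A minimal standalone sketch of the before/after behaviour (arbitrary shapes, not
the benchmark harness):

    import torch

    x = torch.randn(8, 16)                        # stand-in for the GEMM output, shape (B, F)
    x = torch.max(x, dim=1, keepdim=True).values  # shape (B, 1)

    old = x - x.mean(dim=1, keepdim=True)  # mean of one element is the element -> all zeros
    new = x - x.mean(dim=0, keepdim=True)  # deviation from the batch mean -> varies per sample

    print(torch.all(old == 0))  # True: old version feeds a constant zero into GELU
    print(new.std() > 0)        # True: fixed version still depends on the input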

--------------------------------------------------------------------------------

2. level2/83_Conv3d_GroupNorm_Min_Clamp_Dropout.py

Issue: min(x, 0.0) forces all values ≤ 0, then clamp(min=0.0) forces all
values to exactly 0.

Fix: Changed min to use max_value instead of min_value; set max_value=0.5.
- x = torch.min(x, torch.tensor(min_value, device=x.device))
+ x = torch.min(x, torch.tensor(max_value, device=x.device))
- max_value = 1.0
+ max_value = 0.5

Why: min(x, 0.5) caps values at 0.5; the clamp then bounds them to [0, 0.5],
giving an output in the [0, 0.5] range that preserves the Conv3d/GroupNorm variation.
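
A minimal sketch of the two compositions (stand-in tensor, ignoring Conv3d/
GroupNorm/Dropout for brevity):

    import torch

    x = torch.randn(4, 8, 5, 5, 5)  # stand-in for the GroupNorm output

    # Old: min_value = 0.0, max_value = 1.0
    old = torch.clamp(torch.min(x, torch.tensor(0.0)), min=0.0, max=1.0)
    print(torch.all(old == 0))      # True: every element is forced to exactly 0

    # New: min_value = 0.0, max_value = 0.5
    new = torch.clamp(torch.min(x, torch.tensor(0.5)), min=0.0, max=0.5)
    print(new.min().item(), new.max().item())  # values spread across [0, 0.5]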

--------------------------------------------------------------------------------

3. level2/23_Conv3d_GroupNorm_Mean.py

Issue: GroupNorm normalizes to zero mean per group (with default affine
params γ=1, β=0). The global mean of zero-mean data is ~0.

Fix: Replaced mean with amax (global max pooling).
- x = x.mean(dim=[1, 2, 3, 4])
+ x = x.amax(dim=[1, 2, 3, 4])

Collaborator:
imo a better fix here is to just do x.mean(dim=[2, 3, 4]) instead of x.mean(dim=[1, 2, 3, 4]) as it gets around the normalization issue but doesn't change the ops of the problem. It changes the output shape, but that should be fine.

https://github.com/ScalingIntelligence/KernelBench/blob/main/KernelBench/level2/27_Conv3d_HardSwish_GroupNorm_Mean.py does this

Author:
Thanks for the suggestion, change applied.


Why: After GroupNorm, mean is ~0 but max varies per input because different
inputs have different extreme values in the normalized distribution.
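
A rough check of this behaviour (default affine GroupNorm, arbitrary shapes):

    import torch
    import torch.nn as nn

    gn = nn.GroupNorm(num_groups=8, num_channels=24)
    x = gn(torch.randn(4, 24, 6, 8, 8))

    print(x.mean(dim=[1, 2, 3, 4]))  # ~0 for every sample (near-constant output)
    print(x.amax(dim=[1, 2, 3, 4]))  # varies from sample to sample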

--------------------------------------------------------------------------------
89 changes: 89 additions & 0 deletions KernelBench/changelog/redundant_op_fixes.txt
@@ -0,0 +1,89 @@
Changelog: Redundant Operation Fixes
=====================================

Date: 2025-12-20

Removed 7 redundant operations that had no effect on model output.

--------------------------------------------------------------------------------

1. level2/44_ConvTranspose2d_Multiply_GlobalAvgPool_GlobalAvgPool_Mean.py

Issue: The second global avg pool is a no-op (the tensor is already N×C×1×1 after
the first pool).

Fix: Removed the second mean operation.
- x = torch.mean(x, dim=[2, 3], keepdim=True)  # First global average pooling
- x = torch.mean(x, dim=[2, 3], keepdim=True)  # Second global average pooling
+ x = torch.mean(x, dim=[2, 3], keepdim=True)  # Global average pooling
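
A quick sketch of why the second pool was a no-op (arbitrary shapes):

    import torch

    x = torch.randn(2, 3, 16, 16)
    once = torch.mean(x, dim=[2, 3], keepdim=True)      # (2, 3, 1, 1)
    twice = torch.mean(once, dim=[2, 3], keepdim=True)  # mean over singleton dims

    print(torch.equal(once, twice))  # True: the second pooling changes nothing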

--------------------------------------------------------------------------------

2. level2/95_Matmul_Add_Swish_Tanh_GELU_Hardtanh.py

Issue: Hardtanh[-1,1] after tanh→GELU is redundant (GELU of tanh output
is already in approximately [-0.17, 0.84] ⊂ [-1, 1]).

Fix: Removed Hardtanh.
- x = torch.nn.functional.hardtanh(x, min_val=-1, max_val=1)
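
A quick numerical check of the claim (exact GELU, not the tanh approximation):

    import torch

    x = torch.linspace(-10, 10, 10001)
    y = torch.nn.functional.gelu(torch.tanh(x))
    print(y.min().item(), y.max().item())  # roughly -0.17 and 0.84

    hard = torch.nn.functional.hardtanh(y, min_val=-1, max_val=1)
    print(torch.equal(y, hard))            # True: the Hardtanh never clips anything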

--------------------------------------------------------------------------------

3. level2/81_Gemm_Swish_Divide_Clamp_Tanh_Clamp.py

Issue: Final clamp[-1,1] after tanh is redundant (tanh already outputs [-1,1]).

Fix: Removed final clamp.
- x = torch.clamp(x, min=-1.0, max=1.0)
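
A quick check that the clamp never triggers:

    import torch

    y = torch.tanh(torch.randn(1000) * 10)            # already in [-1, 1]
    print(torch.equal(y, torch.clamp(y, -1.0, 1.0)))  # True: the clamp is a no-op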

--------------------------------------------------------------------------------

4. level2/7_Conv3d_ReLU_LeakyReLU_GELU_Sigmoid_BiasAdd.py

Issue: LeakyReLU after ReLU is identity (ReLU output is ≥0, LeakyReLU is
identity for non-negative inputs).

Fix: Removed LeakyReLU.
- x = torch.nn.functional.leaky_relu(x, negative_slope=0.01)
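
A quick check of the identity:

    import torch

    r = torch.relu(torch.randn(1000))  # all values >= 0
    l = torch.nn.functional.leaky_relu(r, negative_slope=0.01)
    print(torch.equal(r, l))           # True: LeakyReLU is a no-op after ReLU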

--------------------------------------------------------------------------------

5. level3/36_LSTMHn.py

Collaborator:
This smells like a bug in the original code; the fix should just be to return out.

Author (@EssamWisam, Jan 4, 2026):
@PaliC responding to this and the comments below: the idea when fixing these problems is to remain backward compatible. The merit of that is that all evaluations where LLMs exploited the redundancy (e.g., published research papers) will remain legit after the fix (changing the output makes all these problems harder, so comparing evaluations across versions becomes even trickier).

That said, I also agree it's more sensible to return the actual model's output. One more maintainer vote would be great @simonguozirui

Collaborator:
I see what you’re saying. However, the changes we’re making for constant outputs aren’t backwards compatible. Similarly, the last version bump of KernelBench did invalidate other LLM solutions (as shapes and distributions changed). If it’s in the spirit of a more useful benchmark, I think it’s correct to break backwards compatibility here (as we’ve done before) with the next version of KernelBench.

In this case we're fixing what looks like a mistake in the initial release and shipping something that's more akin to the tasks we want LLMs to accomplish. Part of the utility of an eval is its practicality; for KernelBench that lies in levels 1 and 3, so we should aim to make those problems useful.

Regardless, @simonguozirui, please chip in. I'll respect whatever the decision ends up being.

Author:
I do side with your view, even though I now remember that one of the reasons I did this was that the last KB release focused on minimizing breaking changes, as noted in the blog post.

Yes, I think ensuring the practicality of the benchmark is more meaningful. I hope future papers remember to include the version.


Issue: fc layer computes output but returns h_n (state[0]) instead, making
fc dead code.

Fix: Removed fc layer from __init__ and forward.
- self.fc = nn.Linear(hidden_size, output_size)
- out = self.fc(out[:, -1, :])
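
A minimal sketch of the pattern (hypothetical sizes; the real problem file defines
the full Model class):

    import torch
    import torch.nn as nn

    lstm = nn.LSTM(input_size=8, hidden_size=16, num_layers=2, batch_first=True)
    fc = nn.Linear(16, 4)

    x = torch.randn(3, 5, 8)
    out, (h_n, c_n) = lstm(x)

    _ = fc(out[:, -1, :])  # computed in the old code but never returned -> dead code
    result = h_n           # what 36_LSTMHn.py actually returns (37_LSTMCn.py returns c_n)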

--------------------------------------------------------------------------------

6. level3/37_LSTMCn.py

Collaborator:
Same as above; the fix should just be to return out.


Issue: fc layer computes output but returns c_n (state[1]) instead, making
fc dead code.

Fix: Removed fc layer from __init__ and forward.
- self.fc = nn.Linear(hidden_size, output_size)
- out = self.fc(out[:, -1, :])

--------------------------------------------------------------------------------

7. level3/49_Mamba2ReturnFinalState.py

Collaborator:
This is another case where the model has a bug: https://github.com/state-spaces/mamba/blob/620cd9816997730a652b7c21d1b59c802e35add0/mamba_ssm/modules/ssd_minimal.py#L34 (@simonguozirui lmk if this is correct)

I'd implement lines 71-78 of the snippet.

I forget if KernelBench supports evaluating tuples, but if it doesn't I'd just flatten and concat the output.


Issue: The Y_diag einsum is computed but never used (the forward returns
new_states[:, -1]). L is only used to compute Y_diag, so both are dead code.

Fix: Removed dead code computing L and Y_diag.
- L = torch.exp(self.segsum(A_blocks))
- Y_diag = torch.einsum("bclhn,bcshn,bhcls,bcshp->bclhp", ...)

--------------------------------------------------------------------------------

TODO: Pending Name Changes (5 files)
-------------------------------------
[ ] level2/23_Conv3d_GroupNorm_Mean.py → 23_Conv3d_GroupNorm_Amax.py
[ ] level2/44_ConvTranspose2d_Multiply_GlobalAvgPool_GlobalAvgPool_Mean.py → 44_ConvTranspose2d_Multiply_GlobalAvgPool_Mean.py
[ ] level2/95_Matmul_Add_Swish_Tanh_GELU_Hardtanh.py → 95_Matmul_Add_Swish_Tanh_GELU.py
[ ] level2/81_Gemm_Swish_Divide_Clamp_Tanh_Clamp.py → 81_Gemm_Swish_Divide_Clamp_Tanh.py
[ ] level2/7_Conv3d_ReLU_LeakyReLU_GELU_Sigmoid_BiasAdd.py → 7_Conv3d_ReLU_GELU_Sigmoid_BiasAdd.py

2 changes: 1 addition & 1 deletion KernelBench/level2/23_Conv3d_GroupNorm_Mean.py
@@ -19,7 +19,7 @@ def forward(self, x):
"""
x = self.conv(x)
x = self.group_norm(x)
x = x.mean(dim=[1, 2, 3, 4]) # Compute mean across all dimensions except batch
x = x.amax(dim=[1, 2, 3, 4]) # Global max pool
return x

batch_size = 128
37 changes: 37 additions & 0 deletions KernelBench/level2/23_Conv3d_GroupNorm_Mean_OLD.py
@@ -0,0 +1,37 @@
import torch
import torch.nn as nn

class Model(nn.Module):
"""
Model that performs a 3D convolution, applies Group Normalization, computes the mean
"""
def __init__(self, in_channels, out_channels, kernel_size, num_groups):
super(Model, self).__init__()
self.conv = nn.Conv3d(in_channels, out_channels, kernel_size)
self.group_norm = nn.GroupNorm(num_groups, out_channels)

def forward(self, x):
"""
Args:
x (torch.Tensor): Input tensor of shape (batch_size, in_channels, D, H, W).
Returns:
torch.Tensor: Output tensor of shape (batch_size, 1).
"""
x = self.conv(x)
x = self.group_norm(x)
x = x.mean(dim=[1, 2, 3, 4]) # Compute mean across all dimensions except batch
return x

batch_size = 128
in_channels = 3
out_channels = 24
D, H, W = 24, 32, 32
kernel_size = 3
num_groups = 8

def get_inputs():
return [torch.rand(batch_size, in_channels, D, H, W)]

def get_init_inputs():
return [in_channels, out_channels, kernel_size, num_groups]

KernelBench/level2/44_ConvTranspose2d_Multiply_GlobalAvgPool_GlobalAvgPool_Mean.py
@@ -14,8 +14,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride, padding, outp
def forward(self, x):
x = self.conv_transpose(x)
x = x * self.multiplier
x = torch.mean(x, dim=[2, 3], keepdim=True) # First global average pooling
x = torch.mean(x, dim=[2, 3], keepdim=True) # Second global average pooling
x = torch.mean(x, dim=[2, 3], keepdim=True) # Global average pooling
return x

batch_size = 16
KernelBench/level2/44_ConvTranspose2d_Multiply_GlobalAvgPool_GlobalAvgPool_Mean_OLD.py
@@ -0,0 +1,36 @@
import torch
import torch.nn as nn

class Model(nn.Module):
"""
Model that performs a transposed convolution, multiplies by a scalar, applies global average pooling,
another global average pooling
"""
def __init__(self, in_channels, out_channels, kernel_size, stride, padding, output_padding, multiplier):
super(Model, self).__init__()
self.conv_transpose = nn.ConvTranspose2d(in_channels, out_channels, kernel_size, stride=stride, padding=padding, output_padding=output_padding)
self.multiplier = multiplier

def forward(self, x):
x = self.conv_transpose(x)
x = x * self.multiplier
x = torch.mean(x, dim=[2, 3], keepdim=True) # First global average pooling
x = torch.mean(x, dim=[2, 3], keepdim=True) # Second global average pooling
return x

batch_size = 16
in_channels = 64
out_channels = 128
height, width = 128, 128
kernel_size = 3
stride = 2
padding = 1
output_padding = 1
multiplier = 0.5

def get_inputs():
return [torch.rand(batch_size, in_channels, height, width)]

def get_init_inputs():
return [in_channels, out_channels, kernel_size, stride, padding, output_padding, multiplier]

KernelBench/level2/7_Conv3d_ReLU_LeakyReLU_GELU_Sigmoid_BiasAdd.py
@@ -13,7 +13,6 @@ def __init__(self, in_channels, out_channels, kernel_size, bias_shape):
def forward(self, x):
x = self.conv(x)
x = torch.relu(x)
x = torch.nn.functional.leaky_relu(x, negative_slope=0.01)
x = torch.nn.functional.gelu(x)
x = torch.sigmoid(x)
x = x + self.bias
KernelBench/level2/7_Conv3d_ReLU_LeakyReLU_GELU_Sigmoid_BiasAdd_OLD.py
@@ -0,0 +1,34 @@
import torch
import torch.nn as nn

class Model(nn.Module):
"""
Model that performs a 3D convolution, applies ReLU, LeakyReLU, GELU, Sigmoid activations, and bias in sequence.
"""
def __init__(self, in_channels, out_channels, kernel_size, bias_shape):
super(Model, self).__init__()
self.conv = nn.Conv3d(in_channels, out_channels, kernel_size)
self.bias = nn.Parameter(torch.randn(bias_shape))

def forward(self, x):
x = self.conv(x)
x = torch.relu(x)
x = torch.nn.functional.leaky_relu(x, negative_slope=0.01)
x = torch.nn.functional.gelu(x)
x = torch.sigmoid(x)
x = x + self.bias
return x

batch_size = 64
in_channels = 8
out_channels = 32
depth, height, width = 32, 64, 64
kernel_size = 3
bias_shape = (out_channels, 1, 1, 1)

def get_inputs():
return [torch.rand(batch_size, in_channels, depth, height, width)]

def get_init_inputs():
return [in_channels, out_channels, kernel_size, bias_shape]

2 changes: 1 addition & 1 deletion KernelBench/level2/80_Gemm_Max_Subtract_GELU.py
@@ -20,7 +20,7 @@ def forward(self, x):
"""
x = self.gemm(x)
x = torch.max(x, dim=self.max_dim, keepdim=True).values
x = x - x.mean(dim=1, keepdim=True)
x = x - x.mean(dim=0, keepdim=True)
x = torch.nn.functional.gelu(x)
return x

37 changes: 37 additions & 0 deletions KernelBench/level2/80_Gemm_Max_Subtract_GELU_OLD.py
@@ -0,0 +1,37 @@
import torch
import torch.nn as nn

class Model(nn.Module):
"""
Model that performs a GEMM, followed by a max operation, subtraction, and GELU activation.
"""
def __init__(self, in_features, out_features, max_dim):
super(Model, self).__init__()
self.gemm = nn.Linear(in_features, out_features)
self.max_dim = max_dim

def forward(self, x):
"""
Args:
x: Input tensor of shape (batch_size, in_features)

Returns:
Output tensor of shape (batch_size, out_features)
"""
x = self.gemm(x)
x = torch.max(x, dim=self.max_dim, keepdim=True).values
x = x - x.mean(dim=1, keepdim=True)
x = torch.nn.functional.gelu(x)
return x

batch_size = 1024
in_features = 8192
out_features = 8192
max_dim = 1

def get_inputs():
return [torch.rand(batch_size, in_features)]

def get_init_inputs():
return [in_features, out_features, max_dim]

KernelBench/level2/81_Gemm_Swish_Divide_Clamp_Tanh_Clamp.py
@@ -21,7 +21,6 @@ def forward(self, x):
x = x / 2.0
x = torch.clamp(x, min=-1.0, max=1.0) # Clamp between -1 and 1
x = torch.tanh(x) # Tanh activation
x = torch.clamp(x, min=-1.0, max=1.0) # Clamp between -1 and 1
return x

batch_size = 1024
36 changes: 36 additions & 0 deletions KernelBench/level2/81_Gemm_Swish_Divide_Clamp_Tanh_Clamp_OLD.py
@@ -0,0 +1,36 @@
import torch
import torch.nn as nn

class Model(nn.Module):
"""
Simple model that performs a gemm, swish, divide, clamp, tanh, and clamp operations.
"""
def __init__(self, in_features, out_features, bias=True):
super(Model, self).__init__()
self.gemm = nn.Linear(in_features, out_features, bias=bias)

def forward(self, x):
"""
Args:
x (torch.Tensor): Input tensor of shape (batch_size, in_features).
Returns:
torch.Tensor: Output tensor of shape (batch_size, out_features).
"""
x = self.gemm(x)
x = x * torch.sigmoid(x) # Swish activation
x = x / 2.0
x = torch.clamp(x, min=-1.0, max=1.0) # Clamp between -1 and 1
x = torch.tanh(x) # Tanh activation
x = torch.clamp(x, min=-1.0, max=1.0) # Clamp between -1 and 1
return x

batch_size = 1024
in_features = 8192
out_features = 8192

def get_inputs():
return [torch.rand(batch_size, in_features)]

def get_init_inputs():
return [in_features, out_features]

4 changes: 2 additions & 2 deletions KernelBench/level2/83_Conv3d_GroupNorm_Min_Clamp_Dropout.py
@@ -14,7 +14,7 @@ def __init__(self, in_channels, out_channels, kernel_size, groups, min_value, ma
def forward(self, x):
x = self.conv(x)
x = self.norm(x)
x = torch.min(x, torch.tensor(min_value, device=x.device))
x = torch.min(x, torch.tensor(max_value, device=x.device))
x = torch.clamp(x, min=min_value, max=max_value)
x = self.dropout(x)
return x
@@ -26,7 +26,7 @@ def forward(self, x):
kernel_size = 3
groups = 8
min_value = 0.0
max_value = 1.0
max_value = 0.5
dropout_p = 0.2

def get_inputs():