ScalingIntelligence · bkal01 · Jan 7, 2026 · Jan 7, 2026 · Jan 7, 2026 · Jan 8, 2026
diff --git a/KernelBench/level1/100_HingeLoss.py b/KernelBench/level1/100_HingeLoss.py
@@ -1,6 +1,8 @@
 import torch
 import torch.nn as nn
 
+from torch.distributions import Pareto
+
 class Model(nn.Module):
     """
     A model that computes Hinge Loss for binary classification tasks.
@@ -19,7 +21,9 @@ def forward(self, predictions, targets):
 dim = 1
 
 def get_inputs():
-    return [torch.rand(batch_size, *input_shape), torch.randint(0, 2, (batch_size,)).float() * 2 - 1]
+    heavy_tailed = Pareto(scale=0.01, alpha=1.5)  # predictions are drawn first, labels second
+    predictions = heavy_tailed.sample((batch_size, *input_shape))
+    return [predictions, torch.randint(0, 2, (batch_size,)).float() * 2 - 1]
 
 def get_init_inputs():
     return []
diff --git a/KernelBench/level1/96_HuberLoss.py b/KernelBench/level1/96_HuberLoss.py
@@ -1,6 +1,8 @@
 import torch
 import torch.nn as nn
 
+from torch.distributions import Pareto
+
 class Model(nn.Module):
     """
     A model that computes Smooth L1 (Huber) Loss for regression tasks.
@@ -20,7 +22,9 @@ def forward(self, predictions, targets):
 
 def get_inputs():
     scale = torch.rand(())
-    return [torch.rand(batch_size, *input_shape)*scale, torch.rand(batch_size, *input_shape)]
+    sample_shape = (batch_size, *input_shape)
+    predictions, targets = (Pareto(0.01, 1.5).sample(sample_shape) for _ in range(2))
+    return [predictions * scale, targets]  # only the predictions are rescaled
 
 def get_init_inputs():
     return []
diff --git a/scripts/verify_bench.py b/scripts/verify_bench.py
@@ -3,17 +3,20 @@
 and random initialization. It compares the output of the original model against itself.
 It ensures that the test is well-formed and there are no sources of non-determinism in the test.
 
-Usage: python test_bench.py
+Usage:
+    python verify_bench.py                              # Run all levels
+    python verify_bench.py level=1                      # Run only level 1
+    python verify_bench.py problem_ids=[96,100]         # Run only problem IDs 96 and 100
 """
 
-import importlib
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
+import importlib.util
+import os
 import random
+
 import numpy as np
-import os
-import importlib.util
+import pydra
+from pydra import Config
+import torch
 
 """
 Test all the reference architectures compiles 
@@ -37,13 +40,21 @@ def set_seed(seed):
 
 
 def check_correctness(
-    Model, NewModel, get_inputs, get_init_inputs, seed=1012, atol=1e-02, rtol=1e-02
+    Model, NewModel, get_inputs, get_init_inputs, seed=42, atol=None, rtol=None, precision=None
 ):
+    if atol is None:
+        atol = get_tolerance_for_precision(precision)
+    if rtol is None:
+        rtol = get_tolerance_for_precision(precision)
     # run the model and check correctness
     with torch.no_grad():
         set_seed(seed)
         inputs = get_inputs()
-        inputs = [x.cuda() if isinstance(x, torch.Tensor) else x for x in inputs]
+        inputs = [x.cuda().to(precision) if isinstance(x, torch.Tensor) else x for x in inputs]
+
+        for i, x in enumerate(inputs):
+            if isinstance(x, torch.Tensor) and torch.isinf(x).any():
+                raise ValueError(f"Input {i} contains infinity values")
 
         set_seed(seed)
         init_inputs = get_init_inputs()
@@ -52,10 +63,10 @@ def check_correctness(
         ]
 
         set_seed(seed)
-        model = Model(*init_inputs).cuda()
+        model = Model(*init_inputs).cuda().to(precision)
 
         set_seed(seed)
-        model_new = NewModel(*init_inputs).cuda()
+        model_new = NewModel(*init_inputs).cuda().to(precision)
 
         output = model(*inputs)
         output_new = model_new(*inputs)
@@ -67,22 +78,46 @@ def check_correctness(
     return True
 
 
-def run(Model, NewModel, get_inputs, get_init_inputs, seed=1012):
-    return check_correctness(Model, NewModel, get_inputs, get_init_inputs, seed)
+def run(Model, NewModel, get_inputs, get_init_inputs, seed=1012, precision=None):  # thin wrapper
+    return check_correctness(Model, NewModel, get_inputs, get_init_inputs, seed=seed, precision=precision)
 
 
 from kernelbench.dataset import construct_kernelbench_dataset
+from kernelbench.eval import get_torch_dtype_from_string, get_tolerance_for_precision
+
+
+class ScriptConfig(Config):
+    """Options for this script; each attribute set below is a default value."""
+    def __init__(self):
+        # Level(s) to run - a single int or a list of ints (main() accepts both)
+        self.level = [1, 2, 3]
+        self.problem_ids = []  # Keep only these problem IDs (e.g., [96, 100]); empty applies no filter
+        # Dataset source, forwarded to construct_kernelbench_dataset(source=...)
+        self.source = "local"
+        # Precision name: "fp32", "fp16", "bf16"; converted to a torch dtype in main()
+        self.precision = "fp32"
 
-def run_all(level):
-    print(f"Running Level {level}")
-    dataset = construct_kernelbench_dataset(level)
+
+def run_all(level: int, problem_ids: list, source: str, precision: torch.dtype):
+    """
+    Verify the problems of one level, or only those whose IDs are in problem_ids.
+    An empty problem_ids list means no filtering is requested for the dataset.
+    """
+    # Report "all" rather than a misleading count of 0 when no filter is given.
+    selection = f"{len(problem_ids)} selected" if problem_ids else "all"
+    print(f"Running Level {level} ({selection} problems) from {source} with precision {precision}")
+    if problem_ids:
+        dataset = construct_kernelbench_dataset(level, source=source, problem_ids=problem_ids)
+    else:
+        dataset = construct_kernelbench_dataset(level, source=source)
+
     total = 0
     passed = 0
     fail_tests = []
 
     for problem in dataset:
-        total += 1
         module_name = problem.name.replace(".py", "")
+        total += 1
         try:
             problem_path = getattr(problem, "path", None)
             if not problem_path:
@@ -100,8 +135,9 @@ def run_all(level):
             Model = getattr(module, "Model")
             get_inputs = getattr(module, "get_inputs")
             get_init_inputs = getattr(module, "get_init_inputs")
-            assert run(Model, Model, get_inputs, get_init_inputs)
+            assert run(Model, Model, get_inputs, get_init_inputs, precision=precision)
             passed += 1
+            print(f"Passed {module_name}")
         except Exception as e:
             print(f"Failed {module_name}: {e}")
             fail_tests.append(module_name)
@@ -110,7 +146,15 @@ def run_all(level):
         print(f"Failed tests: {fail_tests}")
 
 
+@pydra.main(base=ScriptConfig)
+def main(config: ScriptConfig):
+    """Verify every requested level using the configured filter, source and precision."""
+    requested = config.level
+    selected_ids = config.problem_ids or []
+    dtype = get_torch_dtype_from_string(config.precision)
+    for level in (requested if isinstance(requested, list) else [requested]):
+        run_all(level, selected_ids, config.source, dtype)
+
+
 if __name__ == "__main__":
-    run_all(1)
-    run_all(2)
-    run_all(3)
+    main()
diff --git a/src/kernelbench/tests/problems/100_HingeLoss_NEW.py b/src/kernelbench/tests/problems/100_HingeLoss_NEW.py
@@ -0,0 +1,30 @@
+import torch
+import torch.nn as nn
+
+from torch.distributions import Normal
+
+class Model(nn.Module):
+    """
+    A model that computes the mean Hinge Loss for binary classification tasks.
+
+    forward() returns mean(clamp(1 - predictions * targets, min=0)).
+    The constructor takes no arguments.
+    """
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, predictions, targets):
+        return torch.mean(torch.clamp(1 - predictions * targets, min=0))  # mean over all elements
+
+batch_size = 32768  # leading dimension of both tensors built by get_inputs()
+input_shape = (32768,)  # trailing shape of the predictions tensor
+dim = 1  # not referenced anywhere else in this file
+
+def get_inputs():
+    m, s = torch.rand(()), torch.rand(()) + 0.1  # random mean in [0, 1) and std in [0.1, 1.1)
+    predictions = Normal(m, s).sample((batch_size, *input_shape))  # shape (batch_size, *input_shape)
+    targets = torch.randint(0, 2, (batch_size,)).float() * 2 - 1  # labels in {-1.0, +1.0}
+    return [predictions, targets]
+
+def get_init_inputs():
+    return []  # Model() takes no constructor arguments
diff --git a/src/kernelbench/tests/problems/100_HingeLoss_OLD.py b/src/kernelbench/tests/problems/100_HingeLoss_OLD.py
@@ -0,0 +1,25 @@
+import torch
+import torch.nn as nn
+
+class Model(nn.Module):
+    """
+    A model that computes the mean Hinge Loss for binary classification tasks.
+
+    forward() returns mean(clamp(1 - predictions * targets, min=0)).
+    The constructor takes no arguments.
+    """
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, predictions, targets):
+        return torch.mean(torch.clamp(1 - predictions * targets, min=0))  # mean over all elements
+
+batch_size = 32768  # leading dimension of both tensors built by get_inputs()
+input_shape = (32768,)  # trailing shape of the predictions tensor
+dim = 1  # not referenced anywhere else in this file
+
+def get_inputs():
+    return [torch.rand(batch_size, *input_shape), torch.randint(0, 2, (batch_size,)).float() * 2 - 1]  # uniform [0, 1) predictions, labels in {-1.0, +1.0}
+
+def get_init_inputs():
+    return []  # Model() takes no constructor arguments
diff --git a/src/kernelbench/tests/problems/94_MSELoss_NEW.py b/src/kernelbench/tests/problems/94_MSELoss_NEW.py
@@ -0,0 +1,32 @@
+import torch
+import torch.nn as nn
+
+from torch.distributions import Normal
+
+class Model(nn.Module):
+    """
+    A model that computes the Mean Squared Error loss for regression tasks.
+
+    forward() returns mean((predictions - targets) ** 2).
+    The constructor takes no arguments.
+    """
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, predictions, targets):
+        return torch.mean((predictions - targets) ** 2)  # mean over all elements
+
+batch_size = 32768  # leading dimension of the tensors built by get_inputs()
+input_shape = (32768,)  # trailing shape of the tensors built by get_inputs()
+dim = 1  # not referenced anywhere else in this file
+
+def get_inputs():
+    scale = torch.rand(())  # random scalar in [0, 1) applied to predictions only
+    m1, m2 = torch.rand(2)  # separate random means for predictions and targets
+    s1, s2 = torch.rand(2) + 0.1  # standard deviations in [0.1, 1.1)
+    predictions = Normal(m1, s1).sample((batch_size, *input_shape))
+    targets = Normal(m2, s2).sample((batch_size, *input_shape))
+    return [predictions*scale, targets]  # both of shape (batch_size, *input_shape)
+
+def get_init_inputs():
+    return []  # Model() takes no constructor arguments
diff --git a/src/kernelbench/tests/problems/94_MSELoss_OLD.py b/src/kernelbench/tests/problems/94_MSELoss_OLD.py
@@ -0,0 +1,26 @@
+import torch
+import torch.nn as nn
+
+class Model(nn.Module):
+    """
+    A model that computes the Mean Squared Error loss for regression tasks.
+
+    forward() returns mean((predictions - targets) ** 2).
+    The constructor takes no arguments.
+    """
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, predictions, targets):
+        return torch.mean((predictions - targets) ** 2)  # mean over all elements
+
+batch_size = 32768  # leading dimension of the tensors built by get_inputs()
+input_shape = (32768,)  # trailing shape of the tensors built by get_inputs()
+dim = 1  # not referenced anywhere else in this file
+
+def get_inputs():
+    scale = torch.rand(())  # random scalar in [0, 1) applied to predictions only
+    return [torch.rand(batch_size, *input_shape)*scale, torch.rand(batch_size, *input_shape)]  # uniform [0, 1) samples
+
+def get_init_inputs():
+    return []  # Model() takes no constructor arguments
diff --git a/src/kernelbench/tests/problems/96_HuberLoss_NEW.py b/src/kernelbench/tests/problems/96_HuberLoss_NEW.py
@@ -0,0 +1,32 @@
+import torch
+import torch.nn as nn
+
+from torch.distributions import Normal
+
+class Model(nn.Module):
+    """
+    A model that computes Smooth L1 (Huber) Loss for regression tasks.
+
+    forward() calls torch.nn.functional.smooth_l1_loss with its default arguments.
+    The constructor takes no arguments.
+    """
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, predictions, targets):
+        return torch.nn.functional.smooth_l1_loss(predictions, targets)  # no beta/reduction override
+
+batch_size = 32768  # leading dimension of the tensors built by get_inputs()
+input_shape = (32768,)  # trailing shape of the tensors built by get_inputs()
+dim = 1  # not referenced anywhere else in this file
+
+def get_inputs():
+    scale = torch.rand(())  # random scalar in [0, 1) applied to predictions only
+    m1, m2 = torch.rand(2)  # separate random means for predictions and targets
+    s1, s2 = torch.rand(2) + 0.1  # standard deviations in [0.1, 1.1)
+    predictions = Normal(m1, s1).sample((batch_size, *input_shape))
+    targets = Normal(m2, s2).sample((batch_size, *input_shape))
+    return [predictions*scale, targets]  # both of shape (batch_size, *input_shape)
+
+def get_init_inputs():
+    return []  # Model() takes no constructor arguments
diff --git a/src/kernelbench/tests/problems/96_HuberLoss_OLD.py b/src/kernelbench/tests/problems/96_HuberLoss_OLD.py
@@ -0,0 +1,26 @@
+import torch
+import torch.nn as nn
+
+class Model(nn.Module):
+    """
+    A model that computes Smooth L1 (Huber) Loss for regression tasks.
+
+    forward() calls torch.nn.functional.smooth_l1_loss with its default arguments.
+    The constructor takes no arguments.
+    """
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, predictions, targets):
+        return torch.nn.functional.smooth_l1_loss(predictions, targets)  # no beta/reduction override
+
+batch_size = 32768  # leading dimension of the tensors built by get_inputs()
+input_shape = (32768,)  # trailing shape of the tensors built by get_inputs()
+dim = 1  # not referenced anywhere else in this file
+
+def get_inputs():
+    scale = torch.rand(())  # random scalar in [0, 1) applied to predictions only
+    return [torch.rand(batch_size, *input_shape)*scale, torch.rand(batch_size, *input_shape)]  # uniform [0, 1) samples
+
+def get_init_inputs():
+    return []  # Model() takes no constructor arguments