
Commit e59dc73

update readme and refactor examples
1 parent c566c22 commit e59dc73

File tree: 12 files changed, +646 −72 lines changed

README.md

Lines changed: 45 additions & 8 deletions

Here's how `weco` can be applied to common ML engineering tasks. The updated section reads:

### Examples

**Example 1: Optimizing simple PyTorch operations**

```bash
cd examples/hello-kernel-world
pip install torch
weco --source optimize.py \
     --eval-command "python evaluate.py --solution-path optimize.py --device cpu" \
     --metric speedup \
     --maximize true \
     --steps 15 \
     --model claude-3-7-sonnet-20250219 \
     --additional-instructions "Fuse operations in the forward method while ensuring the max float deviation remains small. Maintain the same format of the code."
```

Note that if you have an NVIDIA GPU, change the device to `cuda`. If you are running this on Apple Silicon, set it to `mps`.

**Example 2: Optimizing MLX operations with instructions from a file**

Let's optimize a 2D convolution operation in [`mlx`](https://github.com/ml-explore/mlx) using [Metal](https://developer.apple.com/documentation/metal/). Sometimes, additional context or instructions are too complex for a single command-line string. You can provide a path to a file containing these instructions.

```bash
cd examples/metal
pip install mlx
weco --source optimize.py \
     --eval-command "python evaluate.py --solution-path optimize.py" \
     --metric speedup \
     --maximize true \
     --steps 30 \
     --model o3-mini \
     --additional-instructions examples.rst
```

**Example 3: Level-agnostic optimization: causal self-attention with Triton & CUDA**

Given how useful causal multi-head self-attention is to transformers, it has seen wide adoption across ML engineering and AI research. It's great to keep things at a high level (in PyTorch) when doing research, but when moving to production you often need to write highly customized low-level kernels to make things run as fast as they can. The `weco` CLI can optimize kernels across a variety of abstraction levels and frameworks. Example 2 uses Metal, but let's explore two more frameworks:

1. [Triton](https://github.com/triton-lang/triton)

   ```bash
   cd examples/triton
   pip install torch triton
   weco --source optimize.py \
        --eval-command "python evaluate.py --solution-path optimize.py" \
        --metric speedup \
        --maximize true \
        --steps 30 \
        --model gemini-2.5-pro-preview-03-25 \
        --additional-instructions "Use triton to optimize the code while ensuring a small max float diff. Maintain the same code format."
   ```

2. [CUDA](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html)

   ```bash
   cd examples/cuda
   pip install torch
   weco --source optimize.py \
        --eval-command "python evaluate.py --solution-path optimize.py" \
        --metric speedup \
        --maximize true \
        --steps 30 \
        --model gemini-2.5-pro-preview-03-25 \
        --additional-instructions guide.md
   ```
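In each example, the `--eval-command` script is responsible for printing the value of the metric named by `--metric` (see `examples/cuda/evaluate.py` below, which prints a `speedup: X.XXx` line). A minimal pure-Python sketch of that computation, with hypothetical timings:

```python
# Hypothetical average runtimes (in ms) measured by an evaluation script.
t_baseline_ms = 4.2   # baseline model
t_optimized_ms = 1.5  # candidate (optimized) model

# For --metric speedup with --maximize true, the score weco optimizes is
# baseline time divided by optimized time: > 1.0 means the candidate is faster.
speedup = t_baseline_ms / t_optimized_ms
print(f"speedup: {speedup:.2f}x")  # prints "speedup: 2.80x"
```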
---
### Command Line Arguments

examples/cuda/evaluate.py

Lines changed: 157 additions & 0 deletions
```python
import time
import sys
import os
import pathlib
import importlib.util
import traceback
import math

import torch
import torch.nn as nn
import torch.nn.functional as F


########################################################
# Baseline
########################################################
class Model(nn.Module):
    """
    A vanilla multi-head masked self-attention layer with a projection at the end.
    """

    def __init__(self, n_embd, n_head, attn_pdrop, resid_pdrop, max_seqlen):
        super().__init__()
        assert n_embd % n_head == 0
        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(n_embd, 3 * n_embd)
        # output projection
        self.c_proj = nn.Linear(n_embd, n_embd)
        # regularization
        self.attn_dropout = nn.Dropout(attn_pdrop)
        self.resid_dropout = nn.Dropout(resid_pdrop)
        # causal mask to ensure that attention is only applied to the left in the input sequence
        self.register_buffer("bias", torch.tril(torch.ones(max_seqlen, max_seqlen)).view(1, 1, max_seqlen, max_seqlen))
        self.n_head = n_head
        self.n_embd = n_embd

    def forward(self, x):
        B, T, C = x.size()  # batch size, sequence length, embedding dimensionality (n_embd)
        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)

        # causal self-attention: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float("-inf"))
        att = F.softmax(att, dim=-1)
        att = self.attn_dropout(att)
        y = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = y.transpose(1, 2).contiguous().view(B, T, C)  # re-assemble all head outputs side by side
        # output projection
        y = self.resid_dropout(self.c_proj(y))
        return y


########################################################
# Weco Solution
########################################################
def load_module_from_path(module_path: str, add_to_sys_modules: bool = False):
    # Import the candidate solution as a fresh module keyed by its file stem;
    # it is only registered in sys.modules if explicitly requested.
    module_path = pathlib.Path(module_path)
    name = module_path.stem
    spec = importlib.util.spec_from_file_location(name, module_path)
    mod = importlib.util.module_from_spec(spec)  # type: ignore
    if add_to_sys_modules:
        sys.modules[name] = mod
    spec.loader.exec_module(mod)  # type: ignore
    return mod
```
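The helper above is the standard `importlib` recipe for importing a Python file by path rather than by package name. A self-contained usage sketch (the temp-file "candidate" module here is purely illustrative):

```python
import importlib.util
import pathlib
import tempfile


def load_module_from_path(module_path):
    # Same recipe as in evaluate.py: import a .py file by its filesystem path.
    module_path = pathlib.Path(module_path)
    spec = importlib.util.spec_from_file_location(module_path.stem, module_path)
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)
    return mod


with tempfile.TemporaryDirectory() as d:
    candidate = pathlib.Path(d) / "optimize.py"
    candidate.write_text("ANSWER = 42\n")
    mod = load_module_from_path(candidate)
    print(mod.ANSWER)  # prints 42
```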
70+
71+
########################################################
72+
# Benchmark
73+
########################################################
74+
os.environ["MAX_JOBS"] = "1" # number of workers for building with ninja
75+
76+
77+
def get_inputs(batch_size, seq_len, n_embd, device):
78+
return torch.randn(batch_size, seq_len, n_embd, device=device, dtype=torch.float32)
79+
80+
81+
def bench(f, inputs, n_warmup, n_rep):
82+
with torch.no_grad():
83+
# warmup
84+
for _ in range(n_warmup):
85+
f(inputs) # noqa
86+
87+
# benchmark
88+
t_avg = 0.0
89+
for _ in range(n_rep):
90+
torch.cuda.empty_cache() # Clear cache before timing
91+
start_time = time.time()
92+
f(inputs)
93+
torch.cuda.synchronize() # Wait for all computations to complete
94+
t_avg += time.time() - start_time
95+
t_avg /= n_rep * 1e-3
96+
return t_avg
97+
98+
99+
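The conversion `t_avg /= n_rep * 1e-3` is easy to misread: it divides the accumulated total (in seconds) by `n_rep` and then by `1e-3`, yielding the average per-repetition time in milliseconds. The same arithmetic in isolation, with hypothetical numbers:

```python
n_rep = 5000
t_total_s = 0.25       # hypothetical accumulated wall time in seconds
t_avg = t_total_s
t_avg /= n_rep * 1e-3  # equivalent to t_total_s / n_rep * 1000
print(f"{t_avg:.3f} ms per rep")  # 0.25 s over 5000 reps -> 0.050 ms per rep
```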
```python
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--solution-path", type=str, required=True)
    args = parser.parse_args()

    # benchmarking parameters
    n_correctness_trials = 10
    n_warmup = 1000
    n_rep = 5000

    # init parameters
    max_seqlen = 512
    seq_len = 256
    n_embd = 768
    n_head = 8
    # turn off dropout to measure correctness well
    attn_pdrop = 0.0
    resid_pdrop = 0.0

    # input parameters
    batch_size = 32

    # load solution module
    try:
        torch.manual_seed(0)
        solution_module = load_module_from_path(args.solution_path, add_to_sys_modules=False)
        solution_model = solution_module.Model(
            n_embd=n_embd, n_head=n_head, attn_pdrop=attn_pdrop, resid_pdrop=resid_pdrop, max_seqlen=max_seqlen
        ).to("cuda")
        assert isinstance(solution_model, nn.Module)
    except Exception:
        print(f"Candidate module initialization failed: {traceback.format_exc()}")
        sys.exit(1)

    torch.manual_seed(0)
    baseline_model = Model(
        n_embd=n_embd, n_head=n_head, attn_pdrop=attn_pdrop, resid_pdrop=resid_pdrop, max_seqlen=max_seqlen
    ).to("cuda")

    # measure correctness
    max_diff_avg = 0.0
    for _ in range(n_correctness_trials):
        inputs = get_inputs(batch_size=batch_size, seq_len=seq_len, n_embd=n_embd, device="cuda")
        with torch.no_grad():
            baseline_output = baseline_model(inputs)
            optimized_output = solution_model(inputs)
        max_diff_avg += torch.max(torch.abs(optimized_output - baseline_output)).item()
    max_diff_avg /= n_correctness_trials
    print(f"max float diff between values of baseline and optimized model: {max_diff_avg}")

    # measure performance
    inputs = get_inputs(batch_size=batch_size, seq_len=seq_len, n_embd=n_embd, device="cuda")
    t_avg_baseline = bench(baseline_model, inputs, n_warmup, n_rep)
    print(f"baseline time: {t_avg_baseline:.2f}ms")
    t_avg_optimized = bench(solution_model, inputs, n_warmup, n_rep)
    print(f"optimized time: {t_avg_optimized:.2f}ms")
    print(f"speedup: {t_avg_baseline / t_avg_optimized:.2f}x")
```

examples/cuda/guide.md

Lines changed: 113 additions & 0 deletions
# Writing In-line CUDA Kernels: 101

This document outlines the strategy to improve speedup by writing fused and optimized CUDA kernels using a single-file implementation.

## Requirements

- **Single-File Implementation:** Develop fused CUDA kernels within one file.
- **No Fallback Implementation:** Do not include any alternative or fallback code.
- **Simplicity & Readability:** Write simple, easy-to-understand code and include clear comments.
- **Avoid Templates:** Use plain fused kernel functions without templates.
- **Multiple Kernels Allowed:** You can define more than one kernel in the file if needed.
- **Model Class Requirement:** The solution must include a class `Model` (an `nn.Module` subclass), with the main computation in its `forward` method.
- **Preserve Initialization:** Do not change the initialization of the `Model` class.
- **Focus on Efficiency:** Concentrate solely on efficient PyTorch and CUDA coding without capturing logs.
- **Error Handling:** Any terminal output or errors will be reviewed by an LLM for feedback.

## GPU Hardware Specifications

Here are some details on the hardware you have access to.

```json
{
    "GPU Architecture": "Ampere",
    "GPU Memory": "40GB",
    "Memory Bandwidth": "1935 GB/s",
    "FP64 TFLOPS": "9.7",
    "FP64 Tensor Core TFLOPS": "19.5",
    "FP32 TFLOPS": "19.5",
    "TF32 Tensor Core TFLOPS": "156 (312 with sparsity)",
    "BFLOAT16 Tensor Core TFLOPS": "312 (624 with sparsity)",
    "FP16 Tensor Core TFLOPS": "312 (624 with sparsity)",
    "INT8 Tensor Core TOPS": "624 (1248 with sparsity)",
    "Register File Size": "64K 32-bit registers per SM",
    "Maximum number of registers per thread": "255",
    "Maximum number of thread blocks per SM": "32",
    "Shared memory capacity per SM": "164 KB",
    "Maximum shared memory per thread block": "163 KB"
}
```

## Baseline Code

The baseline implementation of the `Model` class simply performs an element-wise addition.

```python
import torch
import torch.nn as nn


class Model(nn.Module):
    def __init__(self) -> None:
        super().__init__()

    def forward(self, a, b):
        return a + b
```

## Optimized Code

The optimized version employs a custom CUDA kernel for fused element-wise addition. The kernel is defined and compiled inline using PyTorch's `load_inline`.

```python
import torch
import torch.nn as nn
from torch.utils.cpp_extension import load_inline

# Define the custom CUDA kernel for element-wise addition
elementwise_add_source = '''
#include <torch/extension.h>
#include <cuda_runtime.h>

// CUDA kernel for element-wise addition
__global__ void elementwise_add_kernel(const float* a, const float* b, float* out, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        out[idx] = a[idx] + b[idx];
    }
}

// Launch function for the CUDA kernel
torch::Tensor elementwise_add_cuda(torch::Tensor a, torch::Tensor b) {
    auto size = a.numel();
    auto out = torch::zeros_like(a);
    const int block_size = 256;
    const int num_blocks = (size + block_size - 1) / block_size;
    elementwise_add_kernel<<<num_blocks, block_size>>>(a.data_ptr<float>(), b.data_ptr<float>(), out.data_ptr<float>(), size);
    return out;
}
'''

# C++ function prototype declaration
elementwise_add_cpp_source = "torch::Tensor elementwise_add_cuda(torch::Tensor a, torch::Tensor b);"

# Compile the inline CUDA code for element-wise addition
elementwise_add = load_inline(
    name="elementwise_add",
    cpp_sources=elementwise_add_cpp_source,
    cuda_sources=elementwise_add_source,
    functions=["elementwise_add_cuda"],
    verbose=True,
)


class Model(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.elementwise_add = elementwise_add

    def forward(self, a, b):
        return self.elementwise_add.elementwise_add_cuda(a, b)
```
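The launch-configuration line `(size + block_size - 1) / block_size` in the kernel above is integer ceiling division: it rounds the block count up so that every element is covered by a thread, with the `if (idx < size)` guard discarding the overhang. A quick pure-Python check of the same arithmetic:

```python
def num_blocks(size: int, block_size: int) -> int:
    # Integer ceiling division, as used for the CUDA grid size above.
    return (size + block_size - 1) // block_size


# 1000 elements with 256-thread blocks need 4 blocks (3 would cover only 768).
print(num_blocks(1000, 256))  # prints 4
print(num_blocks(1024, 256))  # prints 4 (exact multiple: no extra block)
```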

examples/cuda/optimize.py

Lines changed: 44 additions & 0 deletions
```python
import math

import torch
import torch.nn as nn
import torch.nn.functional as F


class Model(nn.Module):
    """
    A vanilla multi-head masked self-attention layer with a projection at the end.
    """

    def __init__(self, n_embd, n_head, attn_pdrop, resid_pdrop, max_seqlen):
        super().__init__()
        assert n_embd % n_head == 0
        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(n_embd, 3 * n_embd)
        # output projection
        self.c_proj = nn.Linear(n_embd, n_embd)
        # regularization
        self.attn_dropout = nn.Dropout(attn_pdrop)
        self.resid_dropout = nn.Dropout(resid_pdrop)
        # causal mask to ensure that attention is only applied to the left in the input sequence
        self.register_buffer("bias", torch.tril(torch.ones(max_seqlen, max_seqlen)).view(1, 1, max_seqlen, max_seqlen))
        self.n_head = n_head
        self.n_embd = n_embd

    def forward(self, x):
        B, T, C = x.size()  # batch size, sequence length, embedding dimensionality (n_embd)
        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)

        # causal self-attention: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float("-inf"))
        att = F.softmax(att, dim=-1)
        att = self.attn_dropout(att)
        y = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = y.transpose(1, 2).contiguous().view(B, T, C)  # re-assemble all head outputs side by side
        # output projection
        y = self.resid_dropout(self.c_proj(y))
        return y
```
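The mask-then-softmax step in this model has a simple scalar analogue. Below is a dependency-free sketch (illustrative only, not part of the repo) of causal masking for a single attention row: scores at positions to the right of the query index are set to `-inf`, so the softmax assigns them exactly zero weight:

```python
import math


def causal_softmax_row(scores, query_index):
    # Mask out future positions (j > query_index) with -inf, then softmax.
    masked = [s if j <= query_index else float("-inf") for j, s in enumerate(scores)]
    m = max(masked)  # subtract the max for numerical stability
    exps = [math.exp(s - m) for s in masked]  # exp(-inf) == 0.0
    total = sum(exps)
    return [e / total for e in exps]


weights = causal_softmax_row([0.5, 1.0, 2.0, 1.5], query_index=1)
print(weights)  # future positions (indices 2 and 3) get exactly 0 weight
```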
