sutro/gpu_toy.py at main · cybertronai/sutro · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import modal
import time
import subprocess

app = modal.App("gpu-toy-problem")

image = modal.Image.debian_slim(python_version="3.11").pip_install("torch")

# Modal L4 pricing (https://modal.com/pricing)
L4_COST_PER_HOUR = 0.84  # USD/GPU-hour
L4_COST_PER_SEC = L4_COST_PER_HOUR / 3600


@app.function(gpu="L4", image=image)
def gpu_toy():
    """Run a toy problem on an NVIDIA L4 GPU."""
    import torch

    container_start = time.time()

    # Show GPU info via nvidia-smi
    result = subprocess.run(["nvidia-smi"], capture_output=True, text=True)
    print(result.stdout)

    # Toy problem: large matrix multiply on GPU
    device = torch.device("cuda")
    print(f"PyTorch sees: {torch.cuda.get_device_name(0)}")
    print(f"CUDA version: {torch.version.cuda}")

    N = 4096
    print(f"\nMultiplying two {N}x{N} matrices on GPU...")

    A = torch.randn(N, N, device=device)
    B = torch.randn(N, N, device=device)

    # Warm up
    torch.cuda.synchronize()
    C = A @ B
    torch.cuda.synchronize()

    # Timed run
    start = time.time()
    for _ in range(100):
        C = A @ B
    torch.cuda.synchronize()
    compute_elapsed = time.time() - start

    container_elapsed = time.time() - container_start
    tflops = (2 * N**3 * 100) / compute_elapsed / 1e12
    print(f"100 matmuls in {compute_elapsed:.3f}s  ({tflops:.2f} TFLOPS)")
    print(f"Result shape: {C.shape}, sum: {C.sum().item():.2f}")

    return {
        "compute_elapsed": compute_elapsed,
        "container_elapsed": container_elapsed,
        "tflops": tflops,
    }


@app.local_entrypoint()
def main():
    wall_start = time.time()
    result = gpu_toy.remote()
    wall_elapsed = time.time() - wall_start

    # Modal bills per-second of container uptime (wall time includes startup)
    gpu_cost = wall_elapsed * L4_COST_PER_SEC

    print(f"\n{'=' * 55}")
    print(f"  GPU TFLOPS:          {result['tflops']:.2f}")
    print(f"  GPU compute time:    {result['compute_elapsed']:.3f}s")
    print(f"  Container time:      {result['container_elapsed']:.1f}s")
    print(f"  Total wall time:     {wall_elapsed:.1f}s  (incl. startup)")
    print(f"  Estimated cost:      ${gpu_cost:.4f}  "
          f"(L4 @ ${L4_COST_PER_HOUR}/hr)")
    print(f"{'=' * 55}")