**Thunder** makes optimizing PyTorch models easy, augmenting them with custom kernels, fusions, quantization, distributed strategies, and more.
For **end users**, Thunder comes with plugins that provide model speed-ups out of the box, for optimal utilization of the latest generation of hardware.
For **performance experts**, Thunder is the most ergonomic framework for understanding, modifying, and optimizing AI models through composable transformations.
Thunder is a source-to-source deep learning compiler for PyTorch that focuses on making it simple to optimize models for training and inference.
It provides:
- a simple, Pythonic IR capturing the entire computation
- a rich system of transforms that simultaneously operate on the computation IR, the model, and the weights
- an extensible dispatch mechanism to fusers and optimized kernel libraries
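To make the idea of a simple, straight-line Pythonic IR concrete, here is a toy tracer — purely illustrative, not Thunder's actual machinery — that records each operation performed on proxy values and emits a flat program, which is the general shape of a Thunder trace:

```python
# Toy sketch of a straight-line, Pythonic IR (illustrative only; this is
# NOT Thunder's implementation). Each operation on a Proxy appends one
# line to a shared trace, yielding a flat program with no control flow.

class Proxy:
    counter = 0

    def __init__(self, trace, name=None):
        self.trace = trace
        if name is None:
            name = f"t{Proxy.counter}"
            Proxy.counter += 1
        self.name = name

    def _record(self, op, other):
        out = Proxy(self.trace)
        self.trace.append(f"{out.name} = {op}({self.name}, {other.name})")
        return out

    def __add__(self, other):
        return self._record("add", other)

    def __mul__(self, other):
        return self._record("mul", other)


def trace_fn(fn, n_args):
    """Run fn on proxies and return the recorded straight-line program."""
    Proxy.counter = 0  # deterministic temp names per trace
    trace = []
    args = [Proxy(trace, name=f"a{i}") for i in range(n_args)]
    out = fn(*args)
    trace.append(f"return {out.name}")
    return trace


# Example: tracing x * y + x produces a three-line program
for line in trace_fn(lambda x, y: x * y + x, 2):
    print(line)
```

Because the trace is just a list of Python statements, transforms over it are ordinary list-to-list functions — which is what makes them easy to compose.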
With Thunder you can:
- profile deep learning programs easily, map individual ops to kernels and inspect programs interactively
- programmatically replace sequences of operations with optimized ones and see the effect on performance
- acquire full computation graphs without graph breaks by flexibly extending the interpreter
- modify programs to fully utilize bleeding edge kernel libraries on specific hardware
- write models for single GPU and transform them to run distributed
- quickly iterate on mixed precision and quantization strategies to search for combinations that minimally affect quality
- bundle all optimizations in composable recipes, so they can be ported across model families
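The "composable recipes" idea can be sketched in plain Python (hypothetical helper names, not Thunder's API): if each transform maps a program to a program, a recipe is just an ordered bundle of transforms, and recipes themselves compose:

```python
# Illustrative sketch of composable optimization recipes. The transform
# and recipe names below are made up for illustration; only the
# composition pattern is the point.
from functools import reduce

def cast_to_bf16(program):
    return program + ["cast parameters to bfloat16"]

def fuse_elementwise(program):
    return program + ["fuse adjacent elementwise ops"]

def shard_across_gpus(program):
    return program + ["shard weights across GPUs"]

def make_recipe(*transforms):
    """Compose transforms left-to-right into a single callable."""
    return lambda program: reduce(lambda p, t: t(p), transforms, program)

# A speed recipe can be reused as one step of a distributed recipe,
# so the same bundle ports across model families unchanged.
speed_recipe = make_recipe(cast_to_bf16, fuse_elementwise)
distributed_recipe = make_recipe(speed_recipe, shard_across_gpus)

print(distributed_recipe(["original program"]))
```

Because recipes are ordinary callables, swapping one quantization or precision strategy for another is a one-line change — which is what makes rapid iteration on those combinations practical.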
Ultimately, you should think about Thunder as a highly efficient tool to go from “unoptimized” to “optimized”.
If that is of interest to you, read on to Install Thunder and get started quickly.
Although Thunder is a tool for optimizing models, rather than an opaque compiler that gets you speedups out of the box, here is a set of benchmarks.
Perf-wise, out of the box Thunder is in the ballpark of torch.compile, especially when using CUDAGraphs. Note, however, that Thunder is not a competitor to torch.compile! It can actually use torch.compile as one of its fusion executors.
The script `examples/quickstart/hf_llm.py` demonstrates how to benchmark a model for text generation, forward pass, forward pass with loss, and a full forward + backward computation.

On an H100 with torch=2.8.0, nvfuser-cu128-torch28, and Transformers 4.55.4, running Llama 3.2 1B, we see the following timings:
```
Transformers with torch.compile and CUDAGraphs (reduce-overhead mode): 521ms
Transformers with torch.compile but no CUDAGraphs (default mode): 814ms
Transformers without torch.compile: 1493ms
Thunder with CUDAGraphs: 542ms
```
## Plugins
Thunder works in three stages:
1. ⚡️ It acquires your model by interpreting Python bytecode and producing a straight-line Python program
1. ⚡️ It transforms the model and computation trace to make it distributed and change precision