Changes from all commits
177 commits
6605ac9
Backend optimized, naive search setup
mattteochen Jul 2, 2024
7cbecfd
Merge branch 'Lightning-AI:main' into looking-around
mattteochen Jul 2, 2024
4ec2fa0
Single trace region placement impl
mattteochen Jul 4, 2024
8d084a8
Serial exhaustive search
mattteochen Jul 9, 2024
7781d75
Serial greedy search
mattteochen Jul 10, 2024
99eb302
Extended incremental strat with a fusion try after the greedy search …
mattteochen Jul 11, 2024
2dbef84
Fixed key error after fusion_pass (symbol deleted by CSE)
mattteochen Jul 12, 2024
8c4fd7d
Removed from tracking
mattteochen Jul 12, 2024
4651f26
Removed from tracking and updated tests
mattteochen Jul 12, 2024
83a1bea
Merge branch 'main' into automatic-backends-placement
mattteochen Jul 12, 2024
f09f9b3
Removed import
mattteochen Jul 12, 2024
17df099
Change timing function
mattteochen Jul 12, 2024
bdca10b
Cleaned impl and updated debug files generation
mattteochen Jul 12, 2024
a9f0dbd
Moved already present fusion regions count at FusionOperator class le…
mattteochen Jul 12, 2024
3bfb690
Wip
mattteochen Jul 15, 2024
6f93cba
Added torch empty cache during benchmark execution / added trace prov…
mattteochen Jul 16, 2024
0945159
Wip fuser ex focus
mattteochen Jul 16, 2024
24ad509
Fusion executor placements / runtime vs memory placements options
mattteochen Jul 19, 2024
d94e841
Updated test model
mattteochen Jul 19, 2024
77b7871
Merge branch 'main' into automatic-backends-placement
mattteochen Jul 22, 2024
15f51bf
Enhanced bw trace placement search / support for more models (#3)
mattteochen Jul 26, 2024
0f7ced3
Updated test
mattteochen Jul 26, 2024
579888d
Fixed bad list index / removed print
mattteochen Jul 28, 2024
3528e6c
Disabled graphviz / modified test runner
mattteochen Jul 28, 2024
f67bb1c
Using user defined executor list or default as unique executors ref i…
mattteochen Jul 28, 2024
5f76bcf
Computing bw traces taking into consideration every fw traces options (…
mattteochen Jul 29, 2024
5dedbff
Before `transform_for_execution` executors placement autotune / nvsig…
mattteochen Aug 1, 2024
15f914e
Fixed remat visual timing / Switched towards `memory` strat / General…
mattteochen Aug 2, 2024
89bba8d
Testing different compile options for `nvfuser` (#7)
mattteochen Aug 2, 2024
5be42dc
Benchmarking different compile options for nvFuser (#8)
mattteochen Aug 5, 2024
fceed7e
Moved `runtime` or `memory` selection at the end of search when all ca…
mattteochen Aug 6, 2024
53da20c
Refactoring autotune code (#10)
mattteochen Aug 6, 2024
80a11e6
Enlarged benchmark iters and use no remat traces a chance to be the o…
mattteochen Aug 7, 2024
04b3139
Fixed nv fuser compile options, now good traces will be generated (#12)
mattteochen Aug 7, 2024
fd44dc2
Allowing duplicates during eval trace write / Added missing fn call d…
mattteochen Aug 7, 2024
670a582
Updated test runner
mattteochen Aug 7, 2024
dfc7fdc
Prev commit
mattteochen Aug 7, 2024
9b8eb4d
Added comment
mattteochen Aug 7, 2024
823398a
Updated log
mattteochen Aug 9, 2024
afed225
Merge branch 'main' into automatic-backends-placement
mattteochen Aug 12, 2024
a0f0d2b
Defaults empty executors list to all executors
mattteochen Aug 12, 2024
9b32158
Fixed formatting
mattteochen Aug 13, 2024
fdeddc7
Cuda graphs integration / minor changes to benchmarks and profiler / made i…
mattteochen Aug 13, 2024
31c781e
Restore nanogpt config
mattteochen Aug 13, 2024
03188f8
Added nsight to bench
mattteochen Aug 14, 2024
74c5760
Restored old value
mattteochen Aug 14, 2024
51f4592
Updated nsight iter / updated test models
mattteochen Aug 14, 2024
0a101b1
Using cuda graphs only if not disabled
mattteochen Aug 14, 2024
940a704
Updated bench fn
mattteochen Aug 14, 2024
9ef9d0f
Fixed nanogpt test
mattteochen Aug 14, 2024
20992d6
Updated log
mattteochen Aug 14, 2024
0ce7921
Updated log
mattteochen Aug 14, 2024
dcb6326
Added gitignore
mattteochen Aug 14, 2024
9f6ccb1
Added fa3 to autotuner
mattteochen Aug 14, 2024
57e6163
Fixed log
mattteochen Aug 14, 2024
cc96535
Disabled te for now / updated exceptions log
mattteochen Aug 14, 2024
89f8d3f
Refactored and fixed autotuning that requires fw and bw split handlin…
mattteochen Aug 19, 2024
30ef6ca
Disabling reverse search
mattteochen Aug 19, 2024
c43e66a
Transformer Engine support (#15)
mattteochen Aug 20, 2024
79dd4d2
Unified args building fn
mattteochen Aug 20, 2024
e91e0ea
Beam search for fw bw split operators
mattteochen Aug 20, 2024
c0f73c5
Beam search for fw bw split operators
mattteochen Aug 20, 2024
9b1d0cb
Fixed issues about remat and solved sdpa backward pass benchmark issu…
mattteochen Aug 21, 2024
2eeb6fc
Updated test
mattteochen Aug 21, 2024
090b595
Removed print
mattteochen Aug 21, 2024
d030b1e
Updated tests
mattteochen Aug 21, 2024
bb53a5e
Supporting tuples
mattteochen Aug 21, 2024
74ae689
Updated example
mattteochen Aug 21, 2024
7581651
Enabled log and modified tests
mattteochen Aug 21, 2024
fc090c8
Fixed executors list for gradfn picking
mattteochen Aug 21, 2024
df17770
Adding fusion ex to executors list if not present
mattteochen Aug 22, 2024
f7a8b16
Benchmark bw fn with runtime args for all traces
mattteochen Aug 22, 2024
a0cc9ee
Restore input
mattteochen Aug 22, 2024
96a4f93
Updated litgpt runner
mattteochen Aug 22, 2024
2e796a3
Updated litgpt runner
mattteochen Aug 22, 2024
a4d5fa5
Enhanced logs
mattteochen Aug 22, 2024
0782407
Unpacking sequences during search of not used proxies
mattteochen Aug 22, 2024
ed1a2e2
Updated comment
mattteochen Aug 22, 2024
c1560b0
Updated model tests
mattteochen Aug 22, 2024
b890ac6
Updated model tests
mattteochen Aug 22, 2024
4228ff8
Updated log and comment
mattteochen Aug 23, 2024
3682c82
Fixed comment
mattteochen Aug 23, 2024
f6b8d16
Removed file
mattteochen Aug 23, 2024
14074c4
Fixed nsight bench when args need to be cloned as in TE
mattteochen Aug 23, 2024
706ff91
Benchmarking TE on llama
mattteochen Aug 23, 2024
4c32776
nvmath ex, integrated matmul
mattteochen Aug 23, 2024
0b9c5fe
Removed old counter
mattteochen Aug 24, 2024
511fe4b
Restored imports
mattteochen Aug 24, 2024
8cce6df
Restored line order
mattteochen Aug 24, 2024
cc42db2
Updated torch_compile_ex to synch with main
mattteochen Aug 24, 2024
7b55a71
Merge branch 'main' into nvmath
mattteochen Aug 24, 2024
c6bcdcc
Fixed print
mattteochen Aug 24, 2024
67ee055
Skipping single trace region candidate
mattteochen Aug 24, 2024
494fa73
Debug for single trace regions
mattteochen Aug 26, 2024
ed6013a
Print if no nsight
mattteochen Aug 26, 2024
c6270fb
Updated litgpt runner
mattteochen Aug 26, 2024
656e4e3
Fixed cd assertion and print
mattteochen Aug 26, 2024
9468802
Fixed cached update and restore missing args check
mattteochen Aug 26, 2024
f3713b3
Removed visualizer
mattteochen Aug 26, 2024
1d0224e
Prev commit
mattteochen Aug 26, 2024
f93c950
Updated test runner
mattteochen Aug 26, 2024
5d7bd9d
Fixed cache
mattteochen Aug 26, 2024
9317561
Removed print
mattteochen Aug 26, 2024
3aaf44d
Disabled debug
mattteochen Aug 26, 2024
5b6deb0
Updated litgpt
mattteochen Aug 26, 2024
550b639
Unit tests / minor changes for compilation to gain flexibility /…
mattteochen Aug 26, 2024
dfc70b4
Fix appended label
mattteochen Aug 27, 2024
3b3009b
Updated comments and removed import
mattteochen Aug 27, 2024
4ef329b
New tests and linter
mattteochen Aug 27, 2024
e33f29c
Formatter
mattteochen Aug 27, 2024
5ce0002
Fixed tensor device
mattteochen Aug 27, 2024
388d7d0
Added cuda guard
mattteochen Aug 27, 2024
19953cd
Formatter
mattteochen Aug 27, 2024
04124be
Changed file name
mattteochen Aug 27, 2024
44a8939
Restored old value
mattteochen Aug 27, 2024
2bc7546
Restored flag
mattteochen Aug 27, 2024
0fbcbb2
Torch compiler reset
mattteochen Aug 28, 2024
fbf94e0
Updated litgpt runner
mattteochen Aug 28, 2024
be3912a
Added guard for args cloning
mattteochen Aug 28, 2024
7a1cf14
Added guard for name attribute
mattteochen Aug 28, 2024
e746c58
Fixed var overwritten
mattteochen Aug 28, 2024
de67643
Tests
mattteochen Aug 28, 2024
56c8eaf
Updated litgpt
mattteochen Aug 28, 2024
ff06125
Removed not used
mattteochen Aug 28, 2024
637d5ce
Wip on common transformer block replacement
mattteochen Aug 28, 2024
fec126a
Transformer block optimization
mattteochen Aug 30, 2024
cbc4bb6
Fixed bad def value
mattteochen Aug 30, 2024
a14a155
Fixed comment
mattteochen Aug 30, 2024
5e0c379
Fixed comment
mattteochen Aug 30, 2024
dc9d181
Changed log level
mattteochen Aug 30, 2024
b80ffbe
Formatted comment
mattteochen Aug 30, 2024
62b75ef
Updated runner
mattteochen Aug 30, 2024
a9a9a3f
Enabled te and nvFuser compile options from thunder jit / updated tests
mattteochen Aug 30, 2024
448fd8a
Disabled cudagraphs
mattteochen Aug 31, 2024
b08d9e1
Restricting the same executor in vjp pass if common trace block opt i…
mattteochen Aug 31, 2024
4b05a39
Docs
mattteochen Aug 31, 2024
701b36c
Docs and cleaning
mattteochen Aug 31, 2024
cce3ea3
Docs and reorganization
mattteochen Aug 31, 2024
42e4b6b
Using python logger
mattteochen Sep 1, 2024
e494514
Moved cache to compile data
mattteochen Sep 1, 2024
00bff88
Updated logger
mattteochen Sep 1, 2024
2d8eace
Trace configuration dumps and restore (#20)
mattteochen Sep 5, 2024
625b473
Doc
mattteochen Sep 5, 2024
ed12755
Removed trace print
mattteochen Sep 17, 2024
519855a
Enhanced timing measurement / autotuned nvmath matmul
mattteochen Sep 17, 2024
f09a813
Jit doc
mattteochen Sep 17, 2024
a86630c
Renamed class
mattteochen Sep 17, 2024
9383c5e
Removed print
mattteochen Sep 17, 2024
e5f4e7d
Updated doc
mattteochen Sep 17, 2024
0f98689
Updated doc and removed unused imports
mattteochen Sep 17, 2024
38f176a
Restored partial trace benchmark options
mattteochen Sep 17, 2024
a3c037c
Fixed function name typo and renamed dir
mattteochen Sep 17, 2024
46e3a71
Changed optimization type
mattteochen Sep 17, 2024
ff9f661
Formatter
mattteochen Sep 17, 2024
cdc9157
Enhanced logs description
mattteochen Sep 18, 2024
3990619
Merge branch 'main' into develop
mattteochen Sep 18, 2024
6954062
Small fixes to align main
mattteochen Sep 18, 2024
87a79fd
Fixed OOM errors during trace benchmarks leading to a premature end o…
mattteochen Sep 19, 2024
75b856d
Handled nvmath missing installation
mattteochen Sep 19, 2024
f2b934b
Prev commit
mattteochen Sep 19, 2024
42401b1
Prev commit
mattteochen Sep 19, 2024
02f5038
Added comment
mattteochen Sep 19, 2024
fadafe4
Updated test runner
mattteochen Sep 19, 2024
e4707fa
Log applied executors
mattteochen Sep 19, 2024
b728ab6
Torch timer for benchmarks
mattteochen Sep 19, 2024
bb30805
Doc
mattteochen Sep 19, 2024
191f403
Updated Anyproxy hash / updated test runner file
mattteochen Sep 19, 2024
7f64dbf
Formatter
mattteochen Sep 19, 2024
350c451
Restored manual benchmark configuration / added env var for test runner
mattteochen Sep 19, 2024
580050a
Disabled flag
mattteochen Sep 20, 2024
8bbe1ec
Updated doc
mattteochen Sep 20, 2024
df0469e
Autotuner for jit with no autograd
mattteochen Sep 20, 2024
7f57584
Added CUDA barrier for unit test
mattteochen Sep 20, 2024
82017d2
Integrated autotuner in benchmark script
mattteochen Sep 20, 2024
26044a3
Added missing flag
mattteochen Sep 20, 2024
0a17128
Fixed comments
mattteochen Sep 20, 2024
afb5637
Removed comment
mattteochen Sep 20, 2024
4 changes: 4 additions & 0 deletions examples/autotuner/.gitignore
@@ -0,0 +1,4 @@
*.log
*.txt
*.pickle
*.nsys-rep
55 changes: 55 additions & 0 deletions examples/autotuner/LLaMAMLP.py
@@ -0,0 +1,55 @@
"""
This benchmark script is intended to demonstrate the autotuner on a generic model.
No executors are given, leaving full responsibility to Thunder.
"""

import torch
import thunder
from thunder.benchmarks.utils import torch_timer_total_benchmark, torch_total_benchmark


class LLaMAMLP(torch.nn.Module):
def __init__(self, n_embd, intermediate_size) -> None:
super().__init__()
self.fc_1 = torch.nn.Linear(n_embd, intermediate_size, bias=False)
self.fc_2 = torch.nn.Linear(n_embd, intermediate_size, bias=False)
self.proj = torch.nn.Linear(intermediate_size, n_embd, bias=False)

def forward(self, x: torch.Tensor) -> torch.Tensor:
x_fc_1 = self.fc_1(x)
x_fc_2 = self.fc_2(x)
x = torch.nn.functional.silu(x_fc_1) * x_fc_2
return self.proj(x)


with torch.device("cuda"):
mult = 2
a = 4096 * mult
b = 11008 * mult
x = torch.randn(4, 2048, a, requires_grad=True)

model = LLaMAMLP(a, b)

eager = model
torchcompile = torch.compile(model)
jmodel_def = thunder.jit(model)
jmodel_auto = thunder.jit(
model,
autotune_type="runtime",
autotune_enable_te=True,
autotune_nv_enable_options=True,
model_name="LLaMAMLP",
autotune_save_configuration=True,
)

print("deviation def:", (jmodel_def(x) - model(x)).abs().max().item())
print("deviation auto:", (jmodel_auto(x) - model(x)).abs().max().item())

iters = 100
callables = [eager, torchcompile, jmodel_def, jmodel_auto]
labels = ["eager", "torchcompile", "Thunder", "Thunder Autotuned"]
inputs = [x, x, x, x]
print("\nResults with torch total benchmark:")
torch_total_benchmark(callables, labels, inputs, iters)
print("\nResults with torch timer benchmark:")
torch_timer_total_benchmark(callables, labels, inputs, "LlamaMLP")
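Since the script above passes `autotune_save_configuration=True`, a later run could presumably reuse the dumped configuration through `autotune_restore_configuration` (documented in the thunder/__init__.py changes further down) instead of repeating the search. A minimal sketch, not part of the diff — it reuses the LLaMAMLP module and input shapes from the script above, and the configuration file name is a placeholder assumption:

```python
import torch
import thunder

# Hypothetical follow-up run: restore the configuration dumped by the
# autotuned compilation above instead of re-running the search.
# "LLaMAMLP_config.pickle" is a placeholder name; the diff does not show
# the naming scheme used by autotune_save_configuration.
with torch.device("cuda"):
    model = LLaMAMLP(4096 * 2, 11008 * 2)  # class defined in the script above
    x = torch.randn(4, 2048, 4096 * 2, requires_grad=True)

jmodel_restored = thunder.jit(
    model,
    autotune_type="runtime",
    autotune_restore_configuration="LLaMAMLP_config.pickle",  # placeholder path
    model_name="LLaMAMLP",
)
print("deviation restored:", (jmodel_restored(x) - model(x)).abs().max().item())
```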
101 changes: 101 additions & 0 deletions examples/autotuner/litGPT.py
@@ -0,0 +1,101 @@
"""
This script benchmarks litGPT models in a simpler way than thunder.benchmarks.benchmark_litgpt.py, using a fake training loop with no optimizer.
"""

from litgpt import GPT
from thunder.benchmarks.utils import torch_total_benchmark, torch_timer_total_benchmark
from thunder.tests.litgpt_model import Config
import thunder
import torch
import time
from pprint import pprint

torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn

# import os
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

class LitGPTModelThunderConfig:
def __init__(
self,
layers: int,
autotune_type: str,
batch_size: int,
seq_len: int = -1,
model_name: str = "Llama-3-8B",
executors=None,
optimize_transformer_blocks=True,
optimize_transformer_min_block_size=60, # for llama3
) -> None:
self.layers = layers
self.autotune_type = autotune_type
self.batch_size = batch_size
self.seq_len = seq_len
self.model_name = model_name
self.executors = executors
self.optimize_transformer_blocks = optimize_transformer_blocks
self.optimize_transformer_min_block_size = optimize_transformer_min_block_size


to_run = [
LitGPTModelThunderConfig(
1,
"runtime",
2,
executors=[
"cudnn",
"sdpa",
"fa3",
"nvfuser",
"nvmath",
"torchcompile",
],
),
]

for test in to_run:
try:
cfg = Config.from_name(test.model_name)
cfg.n_layer = test.layers
if test.seq_len != -1:
cfg.block_size = test.seq_len
torch.set_default_dtype(torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16)
pprint(cfg)
print("Batch size:", test.batch_size)
with torch.device("cuda"):
model = GPT(cfg)
x = torch.randint(1, model.config.vocab_size, (test.batch_size, cfg.block_size))
target = torch.ones_like(x)

eager = model
torch_compile = torch.compile(model)
jmodel_def = thunder.jit(model)
jmodel_auto = thunder.jit(
model,
autotune_type=test.autotune_type,
executors=test.executors,
autotune_optimize_common_blocks=test.optimize_transformer_blocks,
autotune_optimize_common_blocks_min_size=test.optimize_transformer_min_block_size,
)
print("deviation def:", (jmodel_def(x) - model(x)).abs().max().item())
s = time.time_ns()
print("deviation auto:", (jmodel_auto(x) - model(x)).abs().max().item())
e = time.time_ns()
print("Compilation time:", {(e - s) / 1000000000}, "s")

iters = 100
callables = [eager, torch_compile, jmodel_def, jmodel_auto]
labels = ["eager", "torch.compile", "Thunder", "Thunder Autotuner"]
inputs = [x, x, x, x]
print(f"\nResults torch total benchmark ({iters} iters):")
torch_total_benchmark(callables, labels, inputs, iters, torch.nn.functional.cross_entropy)
print(f"\nResults torch timer benchmark ({iters} iters):")
torch_timer_total_benchmark(callables, labels, inputs, test.model_name, torch.nn.functional.cross_entropy)

print(f'Executors employed: {thunder.executors_applied(jmodel_auto)}')
except Exception as e:
print(f"Benchmark failed:\n{e}")
import traceback

traceback.print_exc()
95 changes: 90 additions & 5 deletions thunder/__init__.py
@@ -266,6 +266,7 @@ def jit(
disable_torch_autograd: bool = False, # TODO Revisit this UX for RC1
transforms: list[Transform] | None = None,
record_history: bool = False,
# autotune_type: Any | None = None,
**compile_options, # TODO RC1 Make this explicit -- dict of options
) -> Callable:
"""Just-in-time compile a callable (function or model).
@@ -292,7 +293,18 @@
- ``"same input"`` - don't check, but just assume that a cached function works if it exists.

transforms: List of transforms to be applied. It should be an instance :class:`thunder.core.transforms.Transform`. Default: ``None``

autotune_type: string selecting the autotuner performance target, either ``"runtime"`` or ``"memory"``.
autotune_nv_enable_options: boolean enabling autotuning of nvFuser compilation options. Currently at most one option will be used. Default: ``False``
autotune_enable_te: boolean enabling autotuning of the TransformerEngine FP8 executor. Default: ``False``
autotune_optimize_common_blocks: boolean enabling the common-block optimization of the trace during compilation (for example transformer layers). This optimization can be used with models that have repeated block structures, such as transformer-based models. You don't need to know
where a block starts or ends; it is handled automatically. Default: ``False``
autotune_optimize_common_blocks_min_size: integer controlling the minimum block length that triggers the common-block optimization. Default: ``-1``
autotune_save_configuration: boolean to produce a configuration file for the current model. This configuration can be loaded afterwards with ``autotune_restore_configuration``. Default: ``False``
autotune_restore_configuration: string containing the name of a cached configuration file, given as a path relative to the script invocation.
model_name: string containing the current model name, used when creating the configuration file with ``autotune_save_configuration``. A default name is used if this is not provided.
"""
from thunder.backend_optimizer.optimizer import OptimizerType

if "executors_list" in compile_options:
warnings.warn("outdated argument executors_list= in call, please use executors=")
@@ -308,6 +320,41 @@
if transforms is None:
transforms = []

required_autotune = compile_options.get("autotune_type", None)
if required_autotune is not None:
if required_autotune not in ["runtime", "memory"]:
raise AssertionError(f"Not supported optimization: {required_autotune}")

compile_options |= {
"autotune_type": OptimizerType.RUNTIME if required_autotune == "runtime" else OptimizerType.MEMORY,
"autotune_executors_placed_by_fw_bw_split": set(),
}

# Default the executors list to all_executors if no options are given
# Otherwise the user restricted choice will be used
from thunder.executors.transformer_engineex import transformer_engine_ex
from thunder.executors.pythonex import ex as python_ex
if not executors:
executors = get_all_executors()
# Remove pythonex
executors = [ex for ex in executors if ex != python_ex]
# Remove transformer_engine if not requested
executors = [
ex
for ex in executors
if ex != transformer_engine_ex
or (ex == transformer_engine_ex and compile_options.get("autotune_enable_te", False))
]
else:
# If TE is in executors list we have to enable the compilation option
if transformer_engine_ex in executors:
compile_options['autotune_enable_te'] = True

from thunder.backend_optimizer.utils import reorder_executors_list
executors = reorder_executors_list(
executors, autotune_enable_te=compile_options.get("autotune_enable_te", False)
)

# Resolve names of executors
executors = resolve_executors(executors)

@@ -450,6 +497,7 @@ def get_computation_and_inputs(*args, **kwargs):
cs.last_traces = comp_traces
cs.last_interpreted_instructions = None
cs.last_interpreter_log = None
cs.last_executors = cd.executors_list
cs.last_prologue_traces = pro_traces
cs.last_prologue = pro
cs.last_prologue_transformation_start = 0
@@ -485,6 +533,7 @@ def get_computation_and_inputs(*args, **kwargs):
cs.last_traces = comp_traces
cs.last_interpreted_instructions = None
cs.last_interpreter_log = None
cs.last_executors = cd.executors_list
cs.last_prologue_traces = pro_traces
cs.last_prologue = pro

@@ -605,6 +654,7 @@ def get_computation_and_inputs(*args, **kwargs):
cs.last_prologue_traces = prologue_traces
cs.last_prologue = pro
cs.last_traces = computation_traces
cs.last_executors = cd.executors_list
backward_traces = []
cs.last_backward_traces = backward_traces
cs.last_interpreter_log = last_interpreter_log
@@ -631,22 +681,44 @@
# Note computation_trc and backward_trc have been appended to cs.last_(backward_)traces
# by split_forward_backward

# Reset the cache for the next compilation
cd.autotuner_bsym_with_gradfn_executor_cache = {}

if backward_trc is None:
from thunder.executors.passes import transform_for_execution as transform_for_execution_pass
from thunder.executors.passes import autotune_transform_for_execution
from thunder.executors.passes import _transform_for_operator_executor_execution
from thunder.distributed.utils import maybe_sort_waits
from thunder.backend_optimizer.optimizer import BackendOptimizer, TraceType

tmp_comp_trc = _transform_for_operator_executor_execution(computation_trc, cd.executors_list)
is_transformed, tmp_comp_trc = maybe_sort_waits(tmp_comp_trc)
if is_transformed:
computation_trc = tmp_comp_trc
computation_traces.append(computation_trc)

extraces = transform_for_execution(
computation_trc,
executors_list=cd.executors_list,
use_del_last_used=False,
)
autotune = cd.compile_options.get('autotune_type', None)
if autotune is None:
extraces = transform_for_execution(
computation_trc,
executors_list=cd.executors_list,
use_del_last_used=False,
)
else:
optimizer_ctx = BackendOptimizer(
priority_executors=cd.executors_list,
apply_bucketing_bw_trace=False,
produce_log=False,
optimizer_type=autotune,
compile_data=cd,
)
extrace = autotune_transform_for_execution(
optimizer_context=optimizer_ctx,
trace=computation_trc,
trace_type=TraceType.FW,
is_computational=True
)
extraces = [extrace]
computation_traces.extend(extraces)
computation_trc = computation_traces[-1]

@@ -834,6 +906,19 @@ def last_prologue_traces(fn) -> TraceCtx:
return cs.last_prologue_traces


def executors_applied(fn) -> Sequence[Executor]:
"""Obtains the list of executors that have been applied to the computational trace.
If the backward trace is not None, the list will also include the executors used in the backward trace.

"""
cs = compile_stats(fn)
if cs is None:
raise TypeError(f"{fn} doesn't seem to be a thunder compiled function.")
if cs.last_executors is None:
raise TypeError(f"{fn} doesn't seem to have been called yet.")
return cs.last_executors


def cache_option(fn) -> CACHE_OPTIONS:
"""Returns the cache options set when JITting the function."""
cd = compile_data(fn)
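Beyond the diff itself, here is a minimal usage sketch of the public surface added in this file — the `autotune_type` compile option and the new `thunder.executors_applied` helper — assuming a CUDA device and the default executor set; the module below is an arbitrary stand-in, not taken from the PR:

```python
import torch
import thunder

# Any nn.Module works; this tiny MLP only exercises the new options.
mlp = torch.nn.Sequential(
    torch.nn.Linear(1024, 4096, bias=False),
    torch.nn.GELU(),
    torch.nn.Linear(4096, 1024, bias=False),
).to("cuda")
x = torch.randn(8, 1024, device="cuda")

# autotune_type selects the performance target ("runtime" or "memory");
# leaving executors unset lets the autotuner fall back to all available executors.
jmlp = thunder.jit(mlp, autotune_type="memory")
out = jmlp(x)  # the first call triggers compilation and the autotuning search

# After the first call, the executors the autotuner settled on can be inspected.
print(thunder.executors_applied(jmlp))
```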