
Commit c677d2c

feat: add llama3.2 support and experiments (#2)
* Improved compilation times
* Added support for model loading, tokenizers, and complex rope types
* Added the llama3.2 implementation in Tempo/JAX/Torch
* Added thunk wrappers (in-place writes and lazy reads), which provide a major speed-up by eliminating redundant copies; a rough sketch follows below
* Added an initial numpy backend
* Added thunk code-generation ability, dropping the need for DL backends to trace thunks themselves
* Misc bugfixes and refactorings
1 parent dc46762 commit c677d2c
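The thunk wrappers are the headline runtime change of this commit. Below is a rough, illustrative sketch of the two ideas, using numpy for concreteness; the names `LazyRead` and `inplace_wrap` are hypothetical, not Tempo's actual API:

```python
import numpy as np


class LazyRead:
    """Defers materializing an input until the thunk actually touches it."""

    def __init__(self, fetch):
        self._fetch = fetch  # zero-argument callable producing an ndarray
        self._value = None

    def get(self) -> np.ndarray:
        if self._value is None:  # fetch at most once, and only on demand
            self._value = self._fetch()
        return self._value


def inplace_wrap(thunk, out: np.ndarray):
    """Wrap a thunk so it writes directly into a preallocated buffer
    instead of returning a fresh array that must then be copied."""

    def wrapped(*inputs):
        thunk(*(i.get() for i in inputs), out=out)  # result lands in `out`
        return out

    return wrapped


# Example: an elementwise add whose result is written straight into `buf`.
buf = np.empty(4)
add = inplace_wrap(np.add, buf)
print(add(LazyRead(lambda: np.ones(4)), LazyRead(lambda: np.full(4, 2.0))))  # [3. 3. 3. 3.]
```

The intent, as described in the commit message, is that results land directly in buffers the runtime already owns, and inputs are only materialized if the thunk actually reads them, which is where the eliminated copies come from.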

239 files changed (+13279 / -6974 lines)


.gitignore

Lines changed: 1 addition & 0 deletions

```diff
@@ -29,6 +29,7 @@ test_run/
 debug_run/
 debug_runs/
 results/
+results_profile/
 examples/experiments/attn_microbench/results/
 data/
 !tempo/api/data/
```

.pre-commit-config.yaml

Lines changed: 12 additions & 8 deletions

```diff
@@ -1,17 +1,23 @@
-default_stages: [ "commit", "commit-msg", "push" ]
+default_stages: ["pre-commit", "commit-msg", "pre-push"]
 default_language_version:
   python: python3
 
 exclude: ^examples/llama/
 
 repos:
+  - repo: https://github.com/asottile/pyupgrade
+    rev: v3.20.0
+    hooks:
+      - id: pyupgrade
+        args: [--py310-plus]
+
   - repo: https://github.com/astral-sh/ruff-pre-commit
     # Ruff version.
     rev: v0.12.1
     hooks:
       # Run the linter.
       - id: ruff
-        args: [ --fix ]
+        args: [--fix]
         files: ^tempo/
       # Run the formatter.
       - id: ruff-format
@@ -29,10 +35,10 @@ repos:
         name: "Mixed line ending fixer"
       - id: check-yaml
         name: "Yaml checker"
-        args: [ '--unsafe' ]
+        args: ["--unsafe"]
       - id: trailing-whitespace
         name: "Trailing whitespace fixer"
-        args: ['--markdown-linebreak-ext=md']
+        args: ["--markdown-linebreak-ext=md"]
 
   - repo: https://github.com/pre-commit/mirrors-mypy
     rev: v1.16.1
@@ -41,15 +47,13 @@ repos:
         name: "Static type checker"
         files: tempo/.*\.py$
 
-
   - repo: https://github.com/alessandrojcm/commitlint-pre-commit-hook
     rev: v9.22.0
     hooks:
       - id: commitlint
         name: "Commit linter"
-        stages: [ commit-msg ]
-        additional_dependencies: [ '@commitlint/config-conventional' ]
-
+        stages: [commit-msg]
+        additional_dependencies: ["@commitlint/config-conventional"]
 
   - repo: https://github.com/kynan/nbstripout
     rev: 0.8.1
```
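The new pyupgrade hook with `--py310-plus` is what drives the typing rewrites visible in the `repro/data_loading.py` diff further down: deprecated `typing` aliases become builtin generics, and `Union[X, None]` becomes `X | None`. An abridged before/after, using signatures adapted from that diff:

```python
import pandas as pd

# Before (pre-3.10 typing aliases):
from typing import Any, Dict, List, Union

def read_csv(path: str) -> Union[pd.DataFrame, None]: ...
def load_sweep_data(sweeps: Dict[str, List[Any]], systems: List[str]) -> Dict[str, Any]: ...

# After `pyupgrade --py310-plus` (builtin generics, PEP 604 unions):
def read_csv(path: str) -> pd.DataFrame | None: ...
def load_sweep_data(sweeps: dict[str, list[Any]], systems: list[str]) -> dict[str, Any]: ...
```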

repro/README.md

Lines changed: 46 additions & 4 deletions

````diff
@@ -60,9 +60,9 @@ repro                              # Package containing all reproducib
 │
 ├── expected_results/               # PNG examples of the expected plots and speedup analysis
 │
-├── sec7_2_lm_decode/               # Scripts for running and plotting Section 7.2's experiments
+├── sec7_2_[lm/llama32]_decode/     # Scripts for running and plotting Section 7.2's experiments
 │   │
-│   ├── impls/                      # Implementations of GPT2's architecture in JAX/Torch/Tempo
+│   ├── impls/                      # Implementations of [GPT2/Llamas]'s architecture in JAX/Torch/Tempo
 │   ├── plot/                       # Plotting scripts for Section 7.2
 │   │   ├── plot_gpt2_time_per_token.py  # Script to plot Figure 9 and 10
 │   │   ├── plot_block_size.py      # Script to plot Figure 11
@@ -138,12 +138,12 @@ We have aimed to make this process as simple as possible:
 
 git clone https://github.com/lsds/Tempo/ tempo
 cd tempo
-chmod +x repro/build_run_container.sh
+chmod +x repro/build_run_container.sh [--llama32]
 
 ./repro/build_run_container.sh
 
 # Now in container
-chmod +x repro/run_all_exprs_and_plot.sh
+chmod +x repro/run_all_exprs_and_plot.sh [--llama32]
 ./repro/run_all_exprs_and_plot.sh
 
 # Before exiting the container, in another shell, copy results out of container
@@ -156,6 +156,13 @@ ssh -4 <HOST> "tar -c -C /home/<USER> /path/to/plots | xz -c" | xz -d | tar -x
 
 ```
 
+## Llama-3.2 Experiments
+
+The reproducer must first obtain a copy of the model by:
+1. Requesting model access from [huggingface](https://huggingface.co/meta-llama/Llama-3.2-3B)
+2. Running 'llama model download --source huggingface --model-id meta-llama/Llama-3.2-3B' to download a checkpoint into ~/.llama/checkpoints
+3. Then follow the previous section, passing --llama32 to the bash scripts invoked.
+
 ## Working with LaunchLib
 
 We developed a tiny library for parallelizing experiments across gpus.
````
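The new Llama-3.2 section assumes the downloaded checkpoint ends up under `~/.llama/checkpoints`, which `--llama32` then mounts into the container. As a small, optional pre-flight check before launching the container; the exact checkpoint subdirectory name here is an assumption based on the model id above:

```python
from pathlib import Path

# Hypothetical sanity check: confirm the Llama-3.2-3B checkpoint exists
# (step 2 above) before running build_run_container.sh --llama32.
ckpt_dir = Path.home() / ".llama" / "checkpoints" / "Llama-3.2-3B"  # assumed layout
if not ckpt_dir.is_dir():
    raise SystemExit(
        f"No checkpoint at {ckpt_dir}; run 'llama model download "
        "--source huggingface --model-id meta-llama/Llama-3.2-3B' first."
    )
print(f"Found checkpoint at {ckpt_dir}")
```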
```diff
@@ -249,6 +256,7 @@ changed, and thus, some results have changed, often for the better.
 We have attempted to disable certain optimizations, where needed, in order to more closely
 match the original results.
 
+
 ### Section 7.2 - GPT-2 Decoding
 
 **Figure 9 - Mean Time per Token with Causal Attention**
@@ -289,6 +297,40 @@ the best tile size for batch size of 64 has shifted to 1024 (instead of 512).
 
 Results match up exactly with original submission. Tempo's circular tensor store uses a single static allocation for windowed attention. Causal attention is decomposed into blocks which are allocated as needed at runtime, causing the step-like behaviour observed.
 
+### Section 7.2 - Llama3.2-3B Decoding
+
+These experiments were not present in the original submission, but have been added to the final version of the paper.
+
+
+**Figure 17a - Causal attention at batch size 16**
+
+![Mean time between tokens with causal attention with batch size 16](expected_results/plots/llama32/tpt/causal_16.png)
+
+
+**Figure 17b - Causal attention at batch size 4**
+
+![Mean time between tokens with causal attention with batch size 4](expected_results/plots/llama32/tpt/causal_4.png)
+
+**Figure 17c - Window attention at batch size 16**
+
+![Mean time between tokens with window attention with batch size 16](expected_results/plots/llama32/tpt/window_16.png)
+
+**For completeness - Window attention at batch size 4**
+
+![Mean time between tokens with window attention with batch size 4](expected_results/plots/llama32/tpt/window_4.png)
+
+**Figure 18 - Block size microbenchmark**
+
+![Block-size microbenchmark](expected_results/plots/llama32/block_size/block.png)
+
+**Figure 19 - Runtime Memory consumption**
+
+![Runtime memory](expected_results/plots/llama32/mem/runtime_mem.png)
+
+**Figure 24 - Compilation time scaling**
+
+![Runtime memory](expected_results/plots/llama32/compilation/compilation_breakdown_multiple.png)
+
 ### Section 7.3 - RL Training (PPO)
 
 **Figure 13 - Small to Medium Scale PPO**
```

repro/build_run_container.sh

Lines changed: 35 additions & 10 deletions

```diff
@@ -1,23 +1,48 @@
 #! /bin/bash
 
+# Default to not mounting llama volume
+MOUNT_LLAMA=false
+
+# Parse command line arguments
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --pull)
+            if ! git diff-index --quiet HEAD --; then
+                echo "Stopping due to uncommitted changes which would prevent pulling. Please commit or stash your changes before running this script."
+                exit 1
+            fi
+            git pull
+            shift
+            ;;
+        --llama32)
+            MOUNT_LLAMA=true
+            shift
+            ;;
+        *)
+            echo "Unknown option $1"
+            echo "Usage: $0 [--pull] [--llama32]"
+            echo "  --pull: Pull latest changes from git before building"
+            echo "  --llama32: Mount ~/.llama volume for llama32 experiments"
+            exit 1
+            ;;
+    esac
+done
+
 # Make sure we are in the repo root
 git_repo_root=$(git rev-parse --show-toplevel)
 pushd $git_repo_root
 
 # Trap to ensure popd is called on exit
 trap 'popd' EXIT
 
-# Check for uncommitted changes and pull if --pull is passed
-if [[ "$1" == "--pull" ]]; then
-    if ! git diff-index --quiet HEAD --; then
-        echo "Stopping due to uncommitted changes which would prevent pulling. Please commit or stash your changes before running this script."
-        exit 1
-    fi
-    git pull
-fi
-
 # Build the container
 DOCKER_BUILDKIT=1 docker build -f docker/gpu.dockerfile -t tempo-gpu .
 
 # Run the container
-docker run --name tempo-repro --gpus 'all' --ipc=host --ulimit memlock=-1:-1 -it --rm tempo-gpu bash
+if [ "$MOUNT_LLAMA" = true ]; then
+    echo "Mounting ~/.llama volume for llama32 experiments..."
+    docker run --name tempo-repro -v ~/.llama:/home/tempo/.llama --gpus 'all' --ipc=host --ulimit memlock=-1:-1 -it --rm tempo-gpu bash
+else
+    echo "Running container without llama volume mount..."
+    docker run --name tempo-repro --gpus 'all' --ipc=host --ulimit memlock=-1:-1 -it --rm tempo-gpu bash
+fi
```

repro/data_loading.py

Lines changed: 32 additions & 23 deletions

```diff
@@ -1,5 +1,6 @@
+from collections.abc import Callable
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Tuple, Union
+from typing import Any
 
 import pandas as pd
 
@@ -15,7 +16,7 @@
 """
 
 
-def read_csv(path: str) -> Union[pd.DataFrame, None]:
+def read_csv(path: str) -> pd.DataFrame | None:
     try:
         return pd.read_csv(path)
     except Exception:
@@ -28,7 +29,7 @@ def parse_error_file(error_file_path: str) -> str:
     Returns "OOM" if the error contains memory-related keywords, "MISSING" otherwise.
     """
     try:
-        with open(error_file_path, "r") as f:
+        with open(error_file_path) as f:
             error_content = f.read().lower()
 
             # Keywords that indicate out-of-memory errors
@@ -57,7 +58,7 @@ def parse_error_file(error_file_path: str) -> str:
         return "MISSING"
 
 
-def get_gpu_id_from_run_data(run_data: Dict[str, Any]) -> int:
+def get_gpu_id_from_run_data(run_data: dict[str, Any]) -> int:
     if run_data["monitor"] is not None:
         gpu_mem_col = [col for col in run_data["monitor"].columns if "gpu" in col and "mem" in col]
         if gpu_mem_col:
@@ -66,8 +67,8 @@ def get_gpu_id_from_run_data(run_data: Dict[str, Any]) -> int:
 
 
 def get_single_run_data(
-    path: str, params: Dict[str, Any], name_function: Callable[[str, Dict[str, Any]], str]
-) -> Dict[str, Any]:
+    path: str, params: dict[str, Any], name_function: Callable[[str, dict[str, Any]], str]
+) -> dict[str, Any]:
     # Generate expected experiment name
     expected_name, experiment_path = name_function(path, params)
 
@@ -102,12 +103,12 @@ def get_single_run_data(
 
 def load_sweep_data(
     base_path: str,
-    base_params: Dict[str, Any],
-    sweeps: Dict[str, List[Any]],
-    systems: List[str],
-    name_function: Callable[[Dict[str, Any]], str],
+    base_params: dict[str, Any],
+    sweeps: dict[str, list[Any]],
+    systems: list[str],
+    name_function: Callable[[dict[str, Any]], str],
     caching_allocators: bool = True,
-) -> Dict[str, Dict[Any, Dict[str, Dict[str, Any]]]]:
+) -> dict[str, dict[Any, dict[str, dict[str, Any]]]]:
     """Load data from the experiment results using the naming scheme from shared.py"""
     # Access the small_to_med_scale subpath
     path = Path(base_path)
@@ -138,10 +139,10 @@
 
 
 def get_sweep_df(
-    data: Dict[str, Dict[Any, Dict[str, Dict[str, Any]]]],
-    sweeps: Dict[str, List[Any]],
+    data: dict[str, dict[Any, dict[str, dict[str, Any]]]],
+    sweeps: dict[str, list[Any]],
     sweep_key: str,
-    systems: List[str],
+    systems: list[str],
 ) -> pd.DataFrame:
     data_list = []
     for sweep_value in sweeps[sweep_key]:
@@ -155,7 +156,7 @@
 
 
 def has_error(
-    data: Dict[str, Dict[Any, Dict[str, Dict[str, Any]]]],
+    data: dict[str, dict[Any, dict[str, dict[str, Any]]]],
     framework: str,
     sweep_key: str,
     sweep_value,
@@ -174,7 +175,7 @@
 
 
 def get_error_type(
-    data: Dict[str, Dict[Any, Dict[str, Dict[str, Any]]]],
+    data: dict[str, dict[Any, dict[str, dict[str, Any]]]],
     framework: str,
     sweep_key: str,
     sweep_value,
@@ -184,13 +185,13 @@
 
 
 def get_normalized_dfs(
-    data: Dict[str, Dict[Any, Dict[str, Dict[str, Any]]]],
+    data: dict[str, dict[Any, dict[str, dict[str, Any]]]],
     framework: str,
     sweep_key: str,
    sweep_value: Any,
     iterations_from_start_to_remove: int = 1,
     iterations_from_end_to_remove: int = 1,
-) -> Tuple[pd.DataFrame, pd.DataFrame]:
+) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Get normalized dataframes for a specific framework and sweep value"""
     run = data[sweep_key][sweep_value][framework]
     df_monitor = run["monitor"]
@@ -279,14 +280,22 @@ def compute_ratios(df: pd.DataFrame) -> pd.DataFrame:
 
 
 def build_aggregate_metric_df(
-    data: Dict[str, Dict[Any, Dict[str, Dict[str, Any]]]],
+    data: dict[str, dict[Any, dict[str, dict[str, Any]]]],
     sweep_key: str,
     sweep_value: Any,
     sys: str,
-) -> Dict[str, Any]:
+) -> dict[str, Any]:
     error = has_error(data, sys, sweep_key, sweep_value)
     error_type = get_error_type(data, sys, sweep_key, sweep_value)
 
+    if not error:
+        df_monitor, df_log = get_normalized_dfs(data, sys, sweep_key, sweep_value, 0)
+
+    mem_dict = {
+        "gpu_mem_mean": df_monitor["gpu_mem_util"].mean() if not error else 0,
+        "gpu_mem_median": df_monitor["gpu_mem_util"].median() if not error else 0,
+        "gpu_mem_peak": df_monitor["gpu_mem_util"].max() if not error else 0,
+    }
     if not error:
         df_monitor, df_log = get_normalized_dfs(data, sys, sweep_key, sweep_value)
 
@@ -297,9 +306,9 @@
         "framework": sys,
         "iter_mean": df_log["elapsed_sec"].diff().mean() if not error else 0,
         "iter_std": df_log["elapsed_sec"].diff().std() if not error else 0,
-        "gpu_mem_mean": df_monitor["gpu_mem_util"].mean() if not error else 0,
-        "gpu_mem_median": df_monitor["gpu_mem_util"].median() if not error else 0,
-        "gpu_mem_peak": df_monitor["gpu_mem_util"].max() if not error else 0,
+        "gpu_mem_mean": mem_dict["gpu_mem_mean"],
+        "gpu_mem_median": mem_dict["gpu_mem_median"],
+        "gpu_mem_peak": mem_dict["gpu_mem_peak"],
         "gpu_util_mean": df_monitor["gpu_util"].mean() if not error else 0,
         "gpu_util_median": df_monitor["gpu_util"].median() if not error else 0,
         "gpu_util_peak": df_monitor["gpu_util"].max() if not error else 0,
```
[5 binary image files changed (previews omitted): 70 KB, 86.5 KB, 66.8 KB, 44.8 KB, 43.8 KB]
